context.dev 1.17.0 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +3 -3
- data/lib/context_dev/models/web_web_crawl_md_params.rb +53 -8
- data/lib/context_dev/models/web_web_crawl_md_response.rb +3 -2
- data/lib/context_dev/models/web_web_scrape_html_params.rb +42 -8
- data/lib/context_dev/models/web_web_scrape_md_params.rb +42 -8
- data/lib/context_dev/resources/web.rb +8 -8
- data/lib/context_dev/version.rb +1 -1
- data/rbi/context_dev/models/web_web_crawl_md_params.rbi +90 -13
- data/rbi/context_dev/models/web_web_crawl_md_response.rbi +4 -2
- data/rbi/context_dev/models/web_web_scrape_html_params.rbi +73 -13
- data/rbi/context_dev/models/web_web_scrape_md_params.rbi +73 -13
- data/rbi/context_dev/resources/web.rbi +18 -15
- data/sig/context_dev/models/web_web_crawl_md_params.rbs +38 -5
- data/sig/context_dev/models/web_web_scrape_html_params.rbs +31 -5
- data/sig/context_dev/models/web_web_scrape_md_params.rbs +31 -5
- data/sig/context_dev/resources/web.rbs +4 -3
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5a6ec86b2fdf43b0c3f9b800d8b93558a0c93fdf23fd72fad27a46ba43f2dccc
|
|
4
|
+
data.tar.gz: 8d5533fd69c08516b77b15b9bdcdfafea6cd18f4be1a77d17daa20bad2a6f9b1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e98fe1516e060eb780f470e8c5ddc29ba4fdea44c4b11ca40d79d2dfacc7009eb11c95360c19dc0f2864857cc2560ce91569d3587cdaacc8009f757b76a293a8
|
|
7
|
+
data.tar.gz: 515ce97f5f873d21cef0665843408f507ad5ee2221ae9308d336617e57b2980d0c8add55c46193f2e10c580819cf09654213d32d6f15558f17c6adc87114ef32
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.18.0 (2026-05-10)
|
|
4
|
+
|
|
5
|
+
Full Changelog: [v1.17.0...v1.18.0](https://github.com/context-dot-dev/context-ruby-sdk/compare/v1.17.0...v1.18.0)
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
* **api:** api update ([b582c05](https://github.com/context-dot-dev/context-ruby-sdk/commit/b582c05376102bb0cb6f8d4d8c9a2cefdef8c1ec))
|
|
10
|
+
* **api:** api update ([4a4e4bb](https://github.com/context-dot-dev/context-ruby-sdk/commit/4a4e4bbc547662de263a307b213dd7eecd03a61d))
|
|
11
|
+
* **api:** manual updates ([ec963bb](https://github.com/context-dot-dev/context-ruby-sdk/commit/ec963bb99ac36d162552c76fb067e87144f21089))
|
|
12
|
+
|
|
3
13
|
## 1.17.0 (2026-05-09)
|
|
4
14
|
|
|
5
15
|
Full Changelog: [v1.16.0...v1.17.0](https://github.com/context-dot-dev/context-ruby-sdk/compare/v1.16.0...v1.17.0)
|
data/README.md
CHANGED
|
@@ -8,8 +8,8 @@ It is generated with [Stainless](https://www.stainless.com/).
|
|
|
8
8
|
|
|
9
9
|
Use the Context Dev MCP Server to enable AI assistants to interact with this API, allowing them to explore endpoints, make test requests, and use documentation to help integrate this SDK into your application.
|
|
10
10
|
|
|
11
|
-
[](https://cursor.com/en-US/install-mcp?name=context
|
|
12
|
-
[](https://vscode.stainless.com/mcp/%7B%22name%22%3A%22context
|
|
11
|
+
[](https://cursor.com/en-US/install-mcp?name=context-dev-mcp&config=eyJuYW1lIjoiY29udGV4dC1kZXYtbWNwIiwidHJhbnNwb3J0IjoiaHR0cCIsInVybCI6Imh0dHBzOi8vY29udGV4dC1kZXYuc3RsbWNwLmNvbSIsImhlYWRlcnMiOnsieC1jb250ZXh0LWRldi1hcGkta2V5IjoiTXkgQVBJIEtleSJ9fQ)
|
|
12
|
+
[](https://vscode.stainless.com/mcp/%7B%22name%22%3A%22context-dev-mcp%22%2C%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fcontext-dev.stlmcp.com%22%2C%22headers%22%3A%7B%22x-context-dev-api-key%22%3A%22My%20API%20Key%22%7D%7D)
|
|
13
13
|
|
|
14
14
|
> Note: You may need to set environment variables in your MCP client.
|
|
15
15
|
|
|
@@ -26,7 +26,7 @@ To use this gem, install via Bundler by adding the following to your application
|
|
|
26
26
|
<!-- x-release-please-start-version -->
|
|
27
27
|
|
|
28
28
|
```ruby
|
|
29
|
-
gem "context.dev", "~> 1.17.0"
|
|
29
|
+
gem "context.dev", "~> 1.18.0"
|
|
30
30
|
```
|
|
31
31
|
|
|
32
32
|
<!-- x-release-please-end -->
|
|
@@ -60,13 +60,12 @@ module ContextDev
|
|
|
60
60
|
# @return [Integer, nil]
|
|
61
61
|
optional :max_pages, Integer, api_name: :maxPages
|
|
62
62
|
|
|
63
|
-
# @!attribute
|
|
64
|
-
#
|
|
65
|
-
#
|
|
66
|
-
# entirely (not included in results and not counted as failures).
|
|
63
|
+
# @!attribute pdf
|
|
64
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
65
|
+
# inclusive 1-based page range.
|
|
67
66
|
#
|
|
68
|
-
# @return [
|
|
69
|
-
optional :
|
|
67
|
+
# @return [ContextDev::Models::WebWebCrawlMdParams::Pdf, nil]
|
|
68
|
+
optional :pdf, -> { ContextDev::WebWebCrawlMdParams::Pdf }
|
|
70
69
|
|
|
71
70
|
# @!attribute shorten_base64_images
|
|
72
71
|
# Truncate base64-encoded image data in the Markdown output
|
|
@@ -74,6 +73,15 @@ module ContextDev
|
|
|
74
73
|
# @return [Boolean, nil]
|
|
75
74
|
optional :shorten_base64_images, ContextDev::Internal::Type::Boolean, api_name: :shortenBase64Images
|
|
76
75
|
|
|
76
|
+
# @!attribute stop_after_ms
|
|
77
|
+
# Soft time budget for the crawl in milliseconds. After each scrape, the crawler
|
|
78
|
+
# checks the elapsed time and, if exceeded, returns the pages collected so far
|
|
79
|
+
# instead of continuing. Min: 10000 (10s). Max: 240000 (4 min). Default: 120000 (2
|
|
80
|
+
# min).
|
|
81
|
+
#
|
|
82
|
+
# @return [Integer, nil]
|
|
83
|
+
optional :stop_after_ms, Integer, api_name: :stopAfterMs
|
|
84
|
+
|
|
77
85
|
# @!attribute timeout_ms
|
|
78
86
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
79
87
|
# than this value, it will be aborted with a 408 status code. Maximum allowed
|
|
@@ -102,7 +110,7 @@ module ContextDev
|
|
|
102
110
|
# @return [Integer, nil]
|
|
103
111
|
optional :wait_for_ms, Integer, api_name: :waitForMs
|
|
104
112
|
|
|
105
|
-
# @!method initialize(url:, follow_subdomains: nil, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil,
|
|
113
|
+
# @!method initialize(url:, follow_subdomains: nil, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil, pdf: nil, shorten_base64_images: nil, stop_after_ms: nil, timeout_ms: nil, url_regex: nil, use_main_content_only: nil, wait_for_ms: nil, request_options: {})
|
|
106
114
|
# Some parameter documentations has been truncated, see
|
|
107
115
|
# {ContextDev::Models::WebWebCrawlMdParams} for more details.
|
|
108
116
|
#
|
|
@@ -122,10 +130,12 @@ module ContextDev
|
|
|
122
130
|
#
|
|
123
131
|
# @param max_pages [Integer] Maximum number of pages to crawl. Hard cap: 500.
|
|
124
132
|
#
|
|
125
|
-
# @param
|
|
133
|
+
# @param pdf [ContextDev::Models::WebWebCrawlMdParams::Pdf] PDF parsing controls. Use start/end to limit text extraction and OCR to an inclu
|
|
126
134
|
#
|
|
127
135
|
# @param shorten_base64_images [Boolean] Truncate base64-encoded image data in the Markdown output
|
|
128
136
|
#
|
|
137
|
+
# @param stop_after_ms [Integer] Soft time budget for the crawl in milliseconds. After each scrape, the crawler c
|
|
138
|
+
#
|
|
129
139
|
# @param timeout_ms [Integer] Optional timeout in milliseconds for the request. If the request takes longer th
|
|
130
140
|
#
|
|
131
141
|
# @param url_regex [String] Regex pattern. Only URLs matching this pattern will be followed and scraped.
|
|
@@ -135,6 +145,41 @@ module ContextDev
|
|
|
135
145
|
# @param wait_for_ms [Integer] Optional browser wait time in milliseconds after initial page load for each craw
|
|
136
146
|
#
|
|
137
147
|
# @param request_options [ContextDev::RequestOptions, Hash{Symbol=>Object}]
|
|
148
|
+
|
|
149
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
150
|
+
# @!attribute end_
|
|
151
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
152
|
+
# Must be greater than or equal to start when both are provided.
|
|
153
|
+
#
|
|
154
|
+
# @return [Integer, nil]
|
|
155
|
+
optional :end_, Integer, api_name: :end
|
|
156
|
+
|
|
157
|
+
# @!attribute should_parse
|
|
158
|
+
# When true, PDF pages are fetched and parsed. When false, PDF pages are skipped
|
|
159
|
+
# entirely (not included in results and not counted as failures).
|
|
160
|
+
#
|
|
161
|
+
# @return [Boolean, nil]
|
|
162
|
+
optional :should_parse, ContextDev::Internal::Type::Boolean, api_name: :shouldParse
|
|
163
|
+
|
|
164
|
+
# @!attribute start
|
|
165
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
166
|
+
#
|
|
167
|
+
# @return [Integer, nil]
|
|
168
|
+
optional :start, Integer
|
|
169
|
+
|
|
170
|
+
# @!method initialize(end_: nil, should_parse: nil, start: nil)
|
|
171
|
+
# Some parameter documentations has been truncated, see
|
|
172
|
+
# {ContextDev::Models::WebWebCrawlMdParams::Pdf} for more details.
|
|
173
|
+
#
|
|
174
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
175
|
+
# inclusive 1-based page range.
|
|
176
|
+
#
|
|
177
|
+
# @param end_ [Integer] Last 1-based PDF page to parse. When omitted, parsing ends at the last page. Mus
|
|
178
|
+
#
|
|
179
|
+
# @param should_parse [Boolean] When true, PDF pages are fetched and parsed. When false, PDF pages are skipped e
|
|
180
|
+
#
|
|
181
|
+
# @param start [Integer] First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
182
|
+
end
|
|
138
183
|
end
|
|
139
184
|
end
|
|
140
185
|
end
|
|
@@ -34,7 +34,8 @@ module ContextDev
|
|
|
34
34
|
required :num_failed, Integer, api_name: :numFailed
|
|
35
35
|
|
|
36
36
|
# @!attribute num_skipped
|
|
37
|
-
# Number of URLs skipped (PDFs when
|
|
37
|
+
# Number of URLs skipped (PDFs when pdf.shouldParse=false, or URLs not matching
|
|
38
|
+
# urlRegex)
|
|
38
39
|
#
|
|
39
40
|
# @return [Integer]
|
|
40
41
|
required :num_skipped, Integer, api_name: :numSkipped
|
|
@@ -59,7 +60,7 @@ module ContextDev
|
|
|
59
60
|
#
|
|
60
61
|
# @param num_failed [Integer] Number of pages that failed to crawl
|
|
61
62
|
#
|
|
62
|
-
# @param num_skipped [Integer] Number of URLs skipped (PDFs when
|
|
63
|
+
# @param num_skipped [Integer] Number of URLs skipped (PDFs when pdf.shouldParse=false, or URLs not matching ur
|
|
63
64
|
#
|
|
64
65
|
# @param num_succeeded [Integer] Number of pages successfully crawled
|
|
65
66
|
#
|
|
@@ -27,13 +27,12 @@ module ContextDev
|
|
|
27
27
|
# @return [Integer, nil]
|
|
28
28
|
optional :max_age_ms, Integer
|
|
29
29
|
|
|
30
|
-
# @!attribute
|
|
31
|
-
#
|
|
32
|
-
#
|
|
33
|
-
# and a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
30
|
+
# @!attribute pdf
|
|
31
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
32
|
+
# inclusive 1-based page range.
|
|
34
33
|
#
|
|
35
|
-
# @return [
|
|
36
|
-
optional :
|
|
34
|
+
# @return [ContextDev::Models::WebWebScrapeHTMLParams::Pdf, nil]
|
|
35
|
+
optional :pdf, -> { ContextDev::WebWebScrapeHTMLParams::Pdf }
|
|
37
36
|
|
|
38
37
|
# @!attribute timeout_ms
|
|
39
38
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
@@ -50,7 +49,7 @@ module ContextDev
|
|
|
50
49
|
# @return [Integer, nil]
|
|
51
50
|
optional :wait_for_ms, Integer
|
|
52
51
|
|
|
53
|
-
# @!method initialize(url:, include_frames: nil, max_age_ms: nil,
|
|
52
|
+
# @!method initialize(url:, include_frames: nil, max_age_ms: nil, pdf: nil, timeout_ms: nil, wait_for_ms: nil, request_options: {})
|
|
54
53
|
# Some parameter documentations has been truncated, see
|
|
55
54
|
# {ContextDev::Models::WebWebScrapeHTMLParams} for more details.
|
|
56
55
|
#
|
|
@@ -60,13 +59,48 @@ module ContextDev
|
|
|
60
59
|
#
|
|
61
60
|
# @param max_age_ms [Integer] Return a cached result if a prior scrape for the same parameters exists and is y
|
|
62
61
|
#
|
|
63
|
-
# @param
|
|
62
|
+
# @param pdf [ContextDev::Models::WebWebScrapeHTMLParams::Pdf] PDF parsing controls. Use start/end to limit text extraction and OCR to an inclu
|
|
64
63
|
#
|
|
65
64
|
# @param timeout_ms [Integer] Optional timeout in milliseconds for the request. If the request takes longer th
|
|
66
65
|
#
|
|
67
66
|
# @param wait_for_ms [Integer] Optional browser wait time in milliseconds after initial page load. Min: 0. Max:
|
|
68
67
|
#
|
|
69
68
|
# @param request_options [ContextDev::RequestOptions, Hash{Symbol=>Object}]
|
|
69
|
+
|
|
70
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
71
|
+
# @!attribute end_
|
|
72
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
73
|
+
# Must be greater than or equal to start when both are provided.
|
|
74
|
+
#
|
|
75
|
+
# @return [Integer, nil]
|
|
76
|
+
optional :end_, Integer, api_name: :end
|
|
77
|
+
|
|
78
|
+
# @!attribute should_parse
|
|
79
|
+
# When true, PDF URLs are fetched and parsed. When false, PDF URLs are skipped and
|
|
80
|
+
# a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
81
|
+
#
|
|
82
|
+
# @return [Boolean, nil]
|
|
83
|
+
optional :should_parse, ContextDev::Internal::Type::Boolean, api_name: :shouldParse
|
|
84
|
+
|
|
85
|
+
# @!attribute start
|
|
86
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
87
|
+
#
|
|
88
|
+
# @return [Integer, nil]
|
|
89
|
+
optional :start, Integer
|
|
90
|
+
|
|
91
|
+
# @!method initialize(end_: nil, should_parse: nil, start: nil)
|
|
92
|
+
# Some parameter documentations has been truncated, see
|
|
93
|
+
# {ContextDev::Models::WebWebScrapeHTMLParams::Pdf} for more details.
|
|
94
|
+
#
|
|
95
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
96
|
+
# inclusive 1-based page range.
|
|
97
|
+
#
|
|
98
|
+
# @param end_ [Integer] Last 1-based PDF page to parse. When omitted, parsing ends at the last page. Mus
|
|
99
|
+
#
|
|
100
|
+
# @param should_parse [Boolean] When true, PDF URLs are fetched and parsed. When false, PDF URLs are skipped and
|
|
101
|
+
#
|
|
102
|
+
# @param start [Integer] First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
103
|
+
end
|
|
70
104
|
end
|
|
71
105
|
end
|
|
72
106
|
end
|
|
@@ -40,13 +40,12 @@ module ContextDev
|
|
|
40
40
|
# @return [Integer, nil]
|
|
41
41
|
optional :max_age_ms, Integer
|
|
42
42
|
|
|
43
|
-
# @!attribute
|
|
44
|
-
#
|
|
45
|
-
#
|
|
46
|
-
# WEBSITE_ACCESS_ERROR is returned.
|
|
43
|
+
# @!attribute pdf
|
|
44
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
45
|
+
# inclusive 1-based page range.
|
|
47
46
|
#
|
|
48
|
-
# @return [
|
|
49
|
-
optional :
|
|
47
|
+
# @return [ContextDev::Models::WebWebScrapeMdParams::Pdf, nil]
|
|
48
|
+
optional :pdf, -> { ContextDev::WebWebScrapeMdParams::Pdf }
|
|
50
49
|
|
|
51
50
|
# @!attribute shorten_base64_images
|
|
52
51
|
# Shorten base64-encoded image data in the Markdown output
|
|
@@ -76,7 +75,7 @@ module ContextDev
|
|
|
76
75
|
# @return [Integer, nil]
|
|
77
76
|
optional :wait_for_ms, Integer
|
|
78
77
|
|
|
79
|
-
# @!method initialize(url:, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil,
|
|
78
|
+
# @!method initialize(url:, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil, pdf: nil, shorten_base64_images: nil, timeout_ms: nil, use_main_content_only: nil, wait_for_ms: nil, request_options: {})
|
|
80
79
|
# Some parameter documentations has been truncated, see
|
|
81
80
|
# {ContextDev::Models::WebWebScrapeMdParams} for more details.
|
|
82
81
|
#
|
|
@@ -90,7 +89,7 @@ module ContextDev
|
|
|
90
89
|
#
|
|
91
90
|
# @param max_age_ms [Integer] Return a cached result if a prior scrape for the same parameters exists and is y
|
|
92
91
|
#
|
|
93
|
-
# @param
|
|
92
|
+
# @param pdf [ContextDev::Models::WebWebScrapeMdParams::Pdf] PDF parsing controls. Use start/end to limit text extraction and OCR to an inclu
|
|
94
93
|
#
|
|
95
94
|
# @param shorten_base64_images [Boolean] Shorten base64-encoded image data in the Markdown output
|
|
96
95
|
#
|
|
@@ -101,6 +100,41 @@ module ContextDev
|
|
|
101
100
|
# @param wait_for_ms [Integer] Optional browser wait time in milliseconds after initial page load before conver
|
|
102
101
|
#
|
|
103
102
|
# @param request_options [ContextDev::RequestOptions, Hash{Symbol=>Object}]
|
|
103
|
+
|
|
104
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
105
|
+
# @!attribute end_
|
|
106
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
107
|
+
# Must be greater than or equal to start when both are provided.
|
|
108
|
+
#
|
|
109
|
+
# @return [Integer, nil]
|
|
110
|
+
optional :end_, Integer, api_name: :end
|
|
111
|
+
|
|
112
|
+
# @!attribute should_parse
|
|
113
|
+
# When true, PDF URLs are fetched and parsed. When false, PDF URLs are skipped and
|
|
114
|
+
# a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
115
|
+
#
|
|
116
|
+
# @return [Boolean, nil]
|
|
117
|
+
optional :should_parse, ContextDev::Internal::Type::Boolean, api_name: :shouldParse
|
|
118
|
+
|
|
119
|
+
# @!attribute start
|
|
120
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
121
|
+
#
|
|
122
|
+
# @return [Integer, nil]
|
|
123
|
+
optional :start, Integer
|
|
124
|
+
|
|
125
|
+
# @!method initialize(end_: nil, should_parse: nil, start: nil)
|
|
126
|
+
# Some parameter documentations has been truncated, see
|
|
127
|
+
# {ContextDev::Models::WebWebScrapeMdParams::Pdf} for more details.
|
|
128
|
+
#
|
|
129
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
130
|
+
# inclusive 1-based page range.
|
|
131
|
+
#
|
|
132
|
+
# @param end_ [Integer] Last 1-based PDF page to parse. When omitted, parsing ends at the last page. Mus
|
|
133
|
+
#
|
|
134
|
+
# @param should_parse [Boolean] When true, PDF URLs are fetched and parsed. When false, PDF URLs are skipped and
|
|
135
|
+
#
|
|
136
|
+
# @param start [Integer] First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
137
|
+
end
|
|
104
138
|
end
|
|
105
139
|
end
|
|
106
140
|
end
|
|
@@ -117,7 +117,7 @@ module ContextDev
|
|
|
117
117
|
# Performs a crawl starting from a given URL, extracts page content as Markdown,
|
|
118
118
|
# and returns results for all crawled pages.
|
|
119
119
|
#
|
|
120
|
-
# @overload web_crawl_md(url:, follow_subdomains: nil, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil,
|
|
120
|
+
# @overload web_crawl_md(url:, follow_subdomains: nil, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil, pdf: nil, shorten_base64_images: nil, stop_after_ms: nil, timeout_ms: nil, url_regex: nil, use_main_content_only: nil, wait_for_ms: nil, request_options: {})
|
|
121
121
|
#
|
|
122
122
|
# @param url [String] The starting URL for the crawl (must include http:// or https:// protocol)
|
|
123
123
|
#
|
|
@@ -135,10 +135,12 @@ module ContextDev
|
|
|
135
135
|
#
|
|
136
136
|
# @param max_pages [Integer] Maximum number of pages to crawl. Hard cap: 500.
|
|
137
137
|
#
|
|
138
|
-
# @param
|
|
138
|
+
# @param pdf [ContextDev::Models::WebWebCrawlMdParams::Pdf] PDF parsing controls. Use start/end to limit text extraction and OCR to an inclu
|
|
139
139
|
#
|
|
140
140
|
# @param shorten_base64_images [Boolean] Truncate base64-encoded image data in the Markdown output
|
|
141
141
|
#
|
|
142
|
+
# @param stop_after_ms [Integer] Soft time budget for the crawl in milliseconds. After each scrape, the crawler c
|
|
143
|
+
#
|
|
142
144
|
# @param timeout_ms [Integer] Optional timeout in milliseconds for the request. If the request takes longer th
|
|
143
145
|
#
|
|
144
146
|
# @param url_regex [String] Regex pattern. Only URLs matching this pattern will be followed and scraped.
|
|
@@ -168,7 +170,7 @@ module ContextDev
|
|
|
168
170
|
#
|
|
169
171
|
# Scrapes the given URL and returns the raw HTML content of the page.
|
|
170
172
|
#
|
|
171
|
-
# @overload web_scrape_html(url:, include_frames: nil, max_age_ms: nil,
|
|
173
|
+
# @overload web_scrape_html(url:, include_frames: nil, max_age_ms: nil, pdf: nil, timeout_ms: nil, wait_for_ms: nil, request_options: {})
|
|
172
174
|
#
|
|
173
175
|
# @param url [String] Full URL to scrape (must include http:// or https:// protocol)
|
|
174
176
|
#
|
|
@@ -176,7 +178,7 @@ module ContextDev
|
|
|
176
178
|
#
|
|
177
179
|
# @param max_age_ms [Integer] Return a cached result if a prior scrape for the same parameters exists and is y
|
|
178
180
|
#
|
|
179
|
-
# @param
|
|
181
|
+
# @param pdf [ContextDev::Models::WebWebScrapeHTMLParams::Pdf] PDF parsing controls. Use start/end to limit text extraction and OCR to an inclu
|
|
180
182
|
#
|
|
181
183
|
# @param timeout_ms [Integer] Optional timeout in milliseconds for the request. If the request takes longer th
|
|
182
184
|
#
|
|
@@ -196,7 +198,6 @@ module ContextDev
|
|
|
196
198
|
query: query.transform_keys(
|
|
197
199
|
include_frames: "includeFrames",
|
|
198
200
|
max_age_ms: "maxAgeMs",
|
|
199
|
-
parse_pdf: "parsePDF",
|
|
200
201
|
timeout_ms: "timeoutMS",
|
|
201
202
|
wait_for_ms: "waitForMs"
|
|
202
203
|
),
|
|
@@ -251,7 +252,7 @@ module ContextDev
|
|
|
251
252
|
#
|
|
252
253
|
# Scrapes the given URL into LLM usable Markdown.
|
|
253
254
|
#
|
|
254
|
-
# @overload web_scrape_md(url:, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil,
|
|
255
|
+
# @overload web_scrape_md(url:, include_frames: nil, include_images: nil, include_links: nil, max_age_ms: nil, pdf: nil, shorten_base64_images: nil, timeout_ms: nil, use_main_content_only: nil, wait_for_ms: nil, request_options: {})
|
|
255
256
|
#
|
|
256
257
|
# @param url [String] Full URL to scrape into LLM usable Markdown (must include http:// or https:// pr
|
|
257
258
|
#
|
|
@@ -263,7 +264,7 @@ module ContextDev
|
|
|
263
264
|
#
|
|
264
265
|
# @param max_age_ms [Integer] Return a cached result if a prior scrape for the same parameters exists and is y
|
|
265
266
|
#
|
|
266
|
-
# @param
|
|
267
|
+
# @param pdf [ContextDev::Models::WebWebScrapeMdParams::Pdf] PDF parsing controls. Use start/end to limit text extraction and OCR to an inclu
|
|
267
268
|
#
|
|
268
269
|
# @param shorten_base64_images [Boolean] Shorten base64-encoded image data in the Markdown output
|
|
269
270
|
#
|
|
@@ -289,7 +290,6 @@ module ContextDev
|
|
|
289
290
|
include_images: "includeImages",
|
|
290
291
|
include_links: "includeLinks",
|
|
291
292
|
max_age_ms: "maxAgeMs",
|
|
292
|
-
parse_pdf: "parsePDF",
|
|
293
293
|
shorten_base64_images: "shortenBase64Images",
|
|
294
294
|
timeout_ms: "timeoutMS",
|
|
295
295
|
use_main_content_only: "useMainContentOnly",
|
data/lib/context_dev/version.rb
CHANGED
|
@@ -69,14 +69,13 @@ module ContextDev
|
|
|
69
69
|
sig { params(max_pages: Integer).void }
|
|
70
70
|
attr_writer :max_pages
|
|
71
71
|
|
|
72
|
-
#
|
|
73
|
-
#
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
attr_reader :parse_pdf
|
|
72
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
73
|
+
# inclusive 1-based page range.
|
|
74
|
+
sig { returns(T.nilable(ContextDev::WebWebCrawlMdParams::Pdf)) }
|
|
75
|
+
attr_reader :pdf
|
|
77
76
|
|
|
78
|
-
sig { params(
|
|
79
|
-
attr_writer :
|
|
77
|
+
sig { params(pdf: ContextDev::WebWebCrawlMdParams::Pdf::OrHash).void }
|
|
78
|
+
attr_writer :pdf
|
|
80
79
|
|
|
81
80
|
# Truncate base64-encoded image data in the Markdown output
|
|
82
81
|
sig { returns(T.nilable(T::Boolean)) }
|
|
@@ -85,6 +84,16 @@ module ContextDev
|
|
|
85
84
|
sig { params(shorten_base64_images: T::Boolean).void }
|
|
86
85
|
attr_writer :shorten_base64_images
|
|
87
86
|
|
|
87
|
+
# Soft time budget for the crawl in milliseconds. After each scrape, the crawler
|
|
88
|
+
# checks the elapsed time and, if exceeded, returns the pages collected so far
|
|
89
|
+
# instead of continuing. Min: 10000 (10s). Max: 240000 (4 min). Default: 120000 (2
|
|
90
|
+
# min).
|
|
91
|
+
sig { returns(T.nilable(Integer)) }
|
|
92
|
+
attr_reader :stop_after_ms
|
|
93
|
+
|
|
94
|
+
sig { params(stop_after_ms: Integer).void }
|
|
95
|
+
attr_writer :stop_after_ms
|
|
96
|
+
|
|
88
97
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
89
98
|
# than this value, it will be aborted with a 408 status code. Maximum allowed
|
|
90
99
|
# value is 300000ms (5 minutes).
|
|
@@ -127,8 +136,9 @@ module ContextDev
|
|
|
127
136
|
max_age_ms: Integer,
|
|
128
137
|
max_depth: Integer,
|
|
129
138
|
max_pages: Integer,
|
|
130
|
-
|
|
139
|
+
pdf: ContextDev::WebWebCrawlMdParams::Pdf::OrHash,
|
|
131
140
|
shorten_base64_images: T::Boolean,
|
|
141
|
+
stop_after_ms: Integer,
|
|
132
142
|
timeout_ms: Integer,
|
|
133
143
|
url_regex: String,
|
|
134
144
|
use_main_content_only: T::Boolean,
|
|
@@ -158,12 +168,16 @@ module ContextDev
|
|
|
158
168
|
max_depth: nil,
|
|
159
169
|
# Maximum number of pages to crawl. Hard cap: 500.
|
|
160
170
|
max_pages: nil,
|
|
161
|
-
#
|
|
162
|
-
#
|
|
163
|
-
|
|
164
|
-
parse_pdf: nil,
|
|
171
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
172
|
+
# inclusive 1-based page range.
|
|
173
|
+
pdf: nil,
|
|
165
174
|
# Truncate base64-encoded image data in the Markdown output
|
|
166
175
|
shorten_base64_images: nil,
|
|
176
|
+
# Soft time budget for the crawl in milliseconds. After each scrape, the crawler
|
|
177
|
+
# checks the elapsed time and, if exceeded, returns the pages collected so far
|
|
178
|
+
# instead of continuing. Min: 10000 (10s). Max: 240000 (4 min). Default: 120000 (2
|
|
179
|
+
# min).
|
|
180
|
+
stop_after_ms: nil,
|
|
167
181
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
168
182
|
# than this value, it will be aborted with a 408 status code. Maximum allowed
|
|
169
183
|
# value is 300000ms (5 minutes).
|
|
@@ -191,8 +205,9 @@ module ContextDev
|
|
|
191
205
|
max_age_ms: Integer,
|
|
192
206
|
max_depth: Integer,
|
|
193
207
|
max_pages: Integer,
|
|
194
|
-
|
|
208
|
+
pdf: ContextDev::WebWebCrawlMdParams::Pdf,
|
|
195
209
|
shorten_base64_images: T::Boolean,
|
|
210
|
+
stop_after_ms: Integer,
|
|
196
211
|
timeout_ms: Integer,
|
|
197
212
|
url_regex: String,
|
|
198
213
|
use_main_content_only: T::Boolean,
|
|
@@ -203,6 +218,68 @@ module ContextDev
|
|
|
203
218
|
end
|
|
204
219
|
def to_hash
|
|
205
220
|
end
|
|
221
|
+
|
|
222
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
223
|
+
OrHash =
|
|
224
|
+
T.type_alias do
|
|
225
|
+
T.any(
|
|
226
|
+
ContextDev::WebWebCrawlMdParams::Pdf,
|
|
227
|
+
ContextDev::Internal::AnyHash
|
|
228
|
+
)
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
232
|
+
# Must be greater than or equal to start when both are provided.
|
|
233
|
+
sig { returns(T.nilable(Integer)) }
|
|
234
|
+
attr_reader :end_
|
|
235
|
+
|
|
236
|
+
sig { params(end_: Integer).void }
|
|
237
|
+
attr_writer :end_
|
|
238
|
+
|
|
239
|
+
# When true, PDF pages are fetched and parsed. When false, PDF pages are skipped
|
|
240
|
+
# entirely (not included in results and not counted as failures).
|
|
241
|
+
sig { returns(T.nilable(T::Boolean)) }
|
|
242
|
+
attr_reader :should_parse
|
|
243
|
+
|
|
244
|
+
sig { params(should_parse: T::Boolean).void }
|
|
245
|
+
attr_writer :should_parse
|
|
246
|
+
|
|
247
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
248
|
+
sig { returns(T.nilable(Integer)) }
|
|
249
|
+
attr_reader :start
|
|
250
|
+
|
|
251
|
+
sig { params(start: Integer).void }
|
|
252
|
+
attr_writer :start
|
|
253
|
+
|
|
254
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
255
|
+
# inclusive 1-based page range.
|
|
256
|
+
sig do
|
|
257
|
+
params(
|
|
258
|
+
end_: Integer,
|
|
259
|
+
should_parse: T::Boolean,
|
|
260
|
+
start: Integer
|
|
261
|
+
).returns(T.attached_class)
|
|
262
|
+
end
|
|
263
|
+
def self.new(
|
|
264
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
265
|
+
# Must be greater than or equal to start when both are provided.
|
|
266
|
+
end_: nil,
|
|
267
|
+
# When true, PDF pages are fetched and parsed. When false, PDF pages are skipped
|
|
268
|
+
# entirely (not included in results and not counted as failures).
|
|
269
|
+
should_parse: nil,
|
|
270
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
271
|
+
start: nil
|
|
272
|
+
)
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
sig do
|
|
276
|
+
override.returns(
|
|
277
|
+
{ end_: Integer, should_parse: T::Boolean, start: Integer }
|
|
278
|
+
)
|
|
279
|
+
end
|
|
280
|
+
def to_hash
|
|
281
|
+
end
|
|
282
|
+
end
|
|
206
283
|
end
|
|
207
284
|
end
|
|
208
285
|
end
|
|
@@ -64,7 +64,8 @@ module ContextDev
|
|
|
64
64
|
sig { returns(Integer) }
|
|
65
65
|
attr_accessor :num_failed
|
|
66
66
|
|
|
67
|
-
# Number of URLs skipped (PDFs when
|
|
67
|
+
# Number of URLs skipped (PDFs when pdf.shouldParse=false, or URLs not matching
|
|
68
|
+
# urlRegex)
|
|
68
69
|
sig { returns(Integer) }
|
|
69
70
|
attr_accessor :num_skipped
|
|
70
71
|
|
|
@@ -90,7 +91,8 @@ module ContextDev
|
|
|
90
91
|
max_crawl_depth:,
|
|
91
92
|
# Number of pages that failed to crawl
|
|
92
93
|
num_failed:,
|
|
93
|
-
# Number of URLs skipped (PDFs when
|
|
94
|
+
# Number of URLs skipped (PDFs when pdf.shouldParse=false, or URLs not matching
|
|
95
|
+
# urlRegex)
|
|
94
96
|
num_skipped:,
|
|
95
97
|
# Number of pages successfully crawled
|
|
96
98
|
num_succeeded:,
|
|
@@ -34,14 +34,13 @@ module ContextDev
|
|
|
34
34
|
sig { params(max_age_ms: Integer).void }
|
|
35
35
|
attr_writer :max_age_ms
|
|
36
36
|
|
|
37
|
-
#
|
|
38
|
-
#
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
attr_reader :parse_pdf
|
|
37
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
38
|
+
# inclusive 1-based page range.
|
|
39
|
+
sig { returns(T.nilable(ContextDev::WebWebScrapeHTMLParams::Pdf)) }
|
|
40
|
+
attr_reader :pdf
|
|
42
41
|
|
|
43
|
-
sig { params(
|
|
44
|
-
attr_writer :
|
|
42
|
+
sig { params(pdf: ContextDev::WebWebScrapeHTMLParams::Pdf::OrHash).void }
|
|
43
|
+
attr_writer :pdf
|
|
45
44
|
|
|
46
45
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
47
46
|
# than this value, it will be aborted with a 408 status code. Maximum allowed
|
|
@@ -65,7 +64,7 @@ module ContextDev
|
|
|
65
64
|
url: String,
|
|
66
65
|
include_frames: T::Boolean,
|
|
67
66
|
max_age_ms: Integer,
|
|
68
|
-
|
|
67
|
+
pdf: ContextDev::WebWebScrapeHTMLParams::Pdf::OrHash,
|
|
69
68
|
timeout_ms: Integer,
|
|
70
69
|
wait_for_ms: Integer,
|
|
71
70
|
request_options: ContextDev::RequestOptions::OrHash
|
|
@@ -80,10 +79,9 @@ module ContextDev
|
|
|
80
79
|
# younger than this many milliseconds. Defaults to 1 day (86400000 ms) when
|
|
81
80
|
# omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.
|
|
82
81
|
max_age_ms: nil,
|
|
83
|
-
#
|
|
84
|
-
#
|
|
85
|
-
|
|
86
|
-
parse_pdf: nil,
|
|
82
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
83
|
+
# inclusive 1-based page range.
|
|
84
|
+
pdf: nil,
|
|
87
85
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
88
86
|
# than this value, it will be aborted with a 408 status code. Maximum allowed
|
|
89
87
|
# value is 300000ms (5 minutes).
|
|
@@ -101,7 +99,7 @@ module ContextDev
|
|
|
101
99
|
url: String,
|
|
102
100
|
include_frames: T::Boolean,
|
|
103
101
|
max_age_ms: Integer,
|
|
104
|
-
|
|
102
|
+
pdf: ContextDev::WebWebScrapeHTMLParams::Pdf,
|
|
105
103
|
timeout_ms: Integer,
|
|
106
104
|
wait_for_ms: Integer,
|
|
107
105
|
request_options: ContextDev::RequestOptions
|
|
@@ -110,6 +108,68 @@ module ContextDev
|
|
|
110
108
|
end
|
|
111
109
|
def to_hash
|
|
112
110
|
end
|
|
111
|
+
|
|
112
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
113
|
+
OrHash =
|
|
114
|
+
T.type_alias do
|
|
115
|
+
T.any(
|
|
116
|
+
ContextDev::WebWebScrapeHTMLParams::Pdf,
|
|
117
|
+
ContextDev::Internal::AnyHash
|
|
118
|
+
)
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
122
|
+
# Must be greater than or equal to start when both are provided.
|
|
123
|
+
sig { returns(T.nilable(Integer)) }
|
|
124
|
+
attr_reader :end_
|
|
125
|
+
|
|
126
|
+
sig { params(end_: Integer).void }
|
|
127
|
+
attr_writer :end_
|
|
128
|
+
|
|
129
|
+
# When true, PDF URLs are fetched and parsed. When false, PDF URLs are skipped and
|
|
130
|
+
# a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
131
|
+
sig { returns(T.nilable(T::Boolean)) }
|
|
132
|
+
attr_reader :should_parse
|
|
133
|
+
|
|
134
|
+
sig { params(should_parse: T::Boolean).void }
|
|
135
|
+
attr_writer :should_parse
|
|
136
|
+
|
|
137
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
138
|
+
sig { returns(T.nilable(Integer)) }
|
|
139
|
+
attr_reader :start
|
|
140
|
+
|
|
141
|
+
sig { params(start: Integer).void }
|
|
142
|
+
attr_writer :start
|
|
143
|
+
|
|
144
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
145
|
+
# inclusive 1-based page range.
|
|
146
|
+
sig do
|
|
147
|
+
params(
|
|
148
|
+
end_: Integer,
|
|
149
|
+
should_parse: T::Boolean,
|
|
150
|
+
start: Integer
|
|
151
|
+
).returns(T.attached_class)
|
|
152
|
+
end
|
|
153
|
+
def self.new(
|
|
154
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
155
|
+
# Must be greater than or equal to start when both are provided.
|
|
156
|
+
end_: nil,
|
|
157
|
+
# When true, PDF URLs are fetched and parsed. When false, PDF URLs are skipped and
|
|
158
|
+
# a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
159
|
+
should_parse: nil,
|
|
160
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
161
|
+
start: nil
|
|
162
|
+
)
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
sig do
|
|
166
|
+
override.returns(
|
|
167
|
+
{ end_: Integer, should_parse: T::Boolean, start: Integer }
|
|
168
|
+
)
|
|
169
|
+
end
|
|
170
|
+
def to_hash
|
|
171
|
+
end
|
|
172
|
+
end
|
|
113
173
|
end
|
|
114
174
|
end
|
|
115
175
|
end
|
|
@@ -46,14 +46,13 @@ module ContextDev
|
|
|
46
46
|
sig { params(max_age_ms: Integer).void }
|
|
47
47
|
attr_writer :max_age_ms
|
|
48
48
|
|
|
49
|
-
#
|
|
50
|
-
#
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
attr_reader :parse_pdf
|
|
49
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
50
|
+
# inclusive 1-based page range.
|
|
51
|
+
sig { returns(T.nilable(ContextDev::WebWebScrapeMdParams::Pdf)) }
|
|
52
|
+
attr_reader :pdf
|
|
54
53
|
|
|
55
|
-
sig { params(
|
|
56
|
-
attr_writer :parse_pdf
|
|
54
|
+
sig { params(pdf: ContextDev::WebWebScrapeMdParams::Pdf::OrHash).void }
|
|
55
|
+
attr_writer :pdf
|
|
57
56
|
|
|
58
57
|
# Shorten base64-encoded image data in the Markdown output
|
|
59
58
|
sig { returns(T.nilable(T::Boolean)) }
|
|
@@ -94,7 +93,7 @@ module ContextDev
|
|
|
94
93
|
include_images: T::Boolean,
|
|
95
94
|
include_links: T::Boolean,
|
|
96
95
|
max_age_ms: Integer,
|
|
97
|
-
|
|
96
|
+
pdf: ContextDev::WebWebScrapeMdParams::Pdf::OrHash,
|
|
98
97
|
shorten_base64_images: T::Boolean,
|
|
99
98
|
timeout_ms: Integer,
|
|
100
99
|
use_main_content_only: T::Boolean,
|
|
@@ -116,10 +115,9 @@ module ContextDev
|
|
|
116
115
|
# younger than this many milliseconds. Defaults to 1 day (86400000 ms) when
|
|
117
116
|
# omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.
|
|
118
117
|
max_age_ms: nil,
|
|
119
|
-
#
|
|
120
|
-
#
|
|
121
|
-
|
|
122
|
-
parse_pdf: nil,
|
|
118
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
119
|
+
# inclusive 1-based page range.
|
|
120
|
+
pdf: nil,
|
|
123
121
|
# Shorten base64-encoded image data in the Markdown output
|
|
124
122
|
shorten_base64_images: nil,
|
|
125
123
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
@@ -144,7 +142,7 @@ module ContextDev
|
|
|
144
142
|
include_images: T::Boolean,
|
|
145
143
|
include_links: T::Boolean,
|
|
146
144
|
max_age_ms: Integer,
|
|
147
|
-
|
|
145
|
+
pdf: ContextDev::WebWebScrapeMdParams::Pdf,
|
|
148
146
|
shorten_base64_images: T::Boolean,
|
|
149
147
|
timeout_ms: Integer,
|
|
150
148
|
use_main_content_only: T::Boolean,
|
|
@@ -155,6 +153,68 @@ module ContextDev
|
|
|
155
153
|
end
|
|
156
154
|
def to_hash
|
|
157
155
|
end
|
|
156
|
+
|
|
157
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
158
|
+
OrHash =
|
|
159
|
+
T.type_alias do
|
|
160
|
+
T.any(
|
|
161
|
+
ContextDev::WebWebScrapeMdParams::Pdf,
|
|
162
|
+
ContextDev::Internal::AnyHash
|
|
163
|
+
)
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
167
|
+
# Must be greater than or equal to start when both are provided.
|
|
168
|
+
sig { returns(T.nilable(Integer)) }
|
|
169
|
+
attr_reader :end_
|
|
170
|
+
|
|
171
|
+
sig { params(end_: Integer).void }
|
|
172
|
+
attr_writer :end_
|
|
173
|
+
|
|
174
|
+
# When true, PDF URLs are fetched and parsed. When false, PDF URLs are skipped and
|
|
175
|
+
# a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
176
|
+
sig { returns(T.nilable(T::Boolean)) }
|
|
177
|
+
attr_reader :should_parse
|
|
178
|
+
|
|
179
|
+
sig { params(should_parse: T::Boolean).void }
|
|
180
|
+
attr_writer :should_parse
|
|
181
|
+
|
|
182
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
183
|
+
sig { returns(T.nilable(Integer)) }
|
|
184
|
+
attr_reader :start
|
|
185
|
+
|
|
186
|
+
sig { params(start: Integer).void }
|
|
187
|
+
attr_writer :start
|
|
188
|
+
|
|
189
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
190
|
+
# inclusive 1-based page range.
|
|
191
|
+
sig do
|
|
192
|
+
params(
|
|
193
|
+
end_: Integer,
|
|
194
|
+
should_parse: T::Boolean,
|
|
195
|
+
start: Integer
|
|
196
|
+
).returns(T.attached_class)
|
|
197
|
+
end
|
|
198
|
+
def self.new(
|
|
199
|
+
# Last 1-based PDF page to parse. When omitted, parsing ends at the last page.
|
|
200
|
+
# Must be greater than or equal to start when both are provided.
|
|
201
|
+
end_: nil,
|
|
202
|
+
# When true, PDF URLs are fetched and parsed. When false, PDF URLs are skipped and
|
|
203
|
+
# a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
204
|
+
should_parse: nil,
|
|
205
|
+
# First 1-based PDF page to parse. When omitted, parsing starts at the first page.
|
|
206
|
+
start: nil
|
|
207
|
+
)
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
sig do
|
|
211
|
+
override.returns(
|
|
212
|
+
{ end_: Integer, should_parse: T::Boolean, start: Integer }
|
|
213
|
+
)
|
|
214
|
+
end
|
|
215
|
+
def to_hash
|
|
216
|
+
end
|
|
217
|
+
end
|
|
158
218
|
end
|
|
159
219
|
end
|
|
160
220
|
end
|
|
@@ -122,8 +122,9 @@ module ContextDev
|
|
|
122
122
|
max_age_ms: Integer,
|
|
123
123
|
max_depth: Integer,
|
|
124
124
|
max_pages: Integer,
|
|
125
|
-
|
|
125
|
+
pdf: ContextDev::WebWebCrawlMdParams::Pdf::OrHash,
|
|
126
126
|
shorten_base64_images: T::Boolean,
|
|
127
|
+
stop_after_ms: Integer,
|
|
127
128
|
timeout_ms: Integer,
|
|
128
129
|
url_regex: String,
|
|
129
130
|
use_main_content_only: T::Boolean,
|
|
@@ -153,12 +154,16 @@ module ContextDev
|
|
|
153
154
|
max_depth: nil,
|
|
154
155
|
# Maximum number of pages to crawl. Hard cap: 500.
|
|
155
156
|
max_pages: nil,
|
|
156
|
-
#
|
|
157
|
-
#
|
|
158
|
-
|
|
159
|
-
parse_pdf: nil,
|
|
157
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
158
|
+
# inclusive 1-based page range.
|
|
159
|
+
pdf: nil,
|
|
160
160
|
# Truncate base64-encoded image data in the Markdown output
|
|
161
161
|
shorten_base64_images: nil,
|
|
162
|
+
# Soft time budget for the crawl in milliseconds. After each scrape, the crawler
|
|
163
|
+
# checks the elapsed time and, if exceeded, returns the pages collected so far
|
|
164
|
+
# instead of continuing. Min: 10000 (10s). Max: 240000 (4 min). Default: 120000 (2
|
|
165
|
+
# min).
|
|
166
|
+
stop_after_ms: nil,
|
|
162
167
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
163
168
|
# than this value, it will be aborted with a 408 status code. Maximum allowed
|
|
164
169
|
# value is 300000ms (5 minutes).
|
|
@@ -181,7 +186,7 @@ module ContextDev
|
|
|
181
186
|
url: String,
|
|
182
187
|
include_frames: T::Boolean,
|
|
183
188
|
max_age_ms: Integer,
|
|
184
|
-
|
|
189
|
+
pdf: ContextDev::WebWebScrapeHTMLParams::Pdf::OrHash,
|
|
185
190
|
timeout_ms: Integer,
|
|
186
191
|
wait_for_ms: Integer,
|
|
187
192
|
request_options: ContextDev::RequestOptions::OrHash
|
|
@@ -196,10 +201,9 @@ module ContextDev
|
|
|
196
201
|
# younger than this many milliseconds. Defaults to 1 day (86400000 ms) when
|
|
197
202
|
# omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.
|
|
198
203
|
max_age_ms: nil,
|
|
199
|
-
#
|
|
200
|
-
#
|
|
201
|
-
|
|
202
|
-
parse_pdf: nil,
|
|
204
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
205
|
+
# inclusive 1-based page range.
|
|
206
|
+
pdf: nil,
|
|
203
207
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
204
208
|
# than this value, it will be aborted with a 408 status code. Maximum allowed
|
|
205
209
|
# value is 300000ms (5 minutes).
|
|
@@ -253,7 +257,7 @@ module ContextDev
|
|
|
253
257
|
include_images: T::Boolean,
|
|
254
258
|
include_links: T::Boolean,
|
|
255
259
|
max_age_ms: Integer,
|
|
256
|
-
|
|
260
|
+
pdf: ContextDev::WebWebScrapeMdParams::Pdf::OrHash,
|
|
257
261
|
shorten_base64_images: T::Boolean,
|
|
258
262
|
timeout_ms: Integer,
|
|
259
263
|
use_main_content_only: T::Boolean,
|
|
@@ -275,10 +279,9 @@ module ContextDev
|
|
|
275
279
|
# younger than this many milliseconds. Defaults to 1 day (86400000 ms) when
|
|
276
280
|
# omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.
|
|
277
281
|
max_age_ms: nil,
|
|
278
|
-
#
|
|
279
|
-
#
|
|
280
|
-
|
|
281
|
-
parse_pdf: nil,
|
|
282
|
+
# PDF parsing controls. Use start/end to limit text extraction and OCR to an
|
|
283
|
+
# inclusive 1-based page range.
|
|
284
|
+
pdf: nil,
|
|
282
285
|
# Shorten base64-encoded image data in the Markdown output
|
|
283
286
|
shorten_base64_images: nil,
|
|
284
287
|
# Optional timeout in milliseconds for the request. If the request takes longer
|
|
@@ -10,8 +10,9 @@ module ContextDev
|
|
|
10
10
|
max_age_ms: Integer,
|
|
11
11
|
max_depth: Integer,
|
|
12
12
|
max_pages: Integer,
|
|
13
|
-
|
|
13
|
+
pdf: ContextDev::WebWebCrawlMdParams::Pdf,
|
|
14
14
|
:shorten_base64_images => bool,
|
|
15
|
+
stop_after_ms: Integer,
|
|
15
16
|
timeout_ms: Integer,
|
|
16
17
|
url_regex: String,
|
|
17
18
|
use_main_content_only: bool,
|
|
@@ -53,14 +54,20 @@ module ContextDev
|
|
|
53
54
|
|
|
54
55
|
def max_pages=: (Integer) -> Integer
|
|
55
56
|
|
|
56
|
-
attr_reader
|
|
57
|
+
attr_reader pdf: ContextDev::WebWebCrawlMdParams::Pdf?
|
|
57
58
|
|
|
58
|
-
def
|
|
59
|
+
def pdf=: (
|
|
60
|
+
ContextDev::WebWebCrawlMdParams::Pdf
|
|
61
|
+
) -> ContextDev::WebWebCrawlMdParams::Pdf
|
|
59
62
|
|
|
60
63
|
attr_reader shorten_base64_images: bool?
|
|
61
64
|
|
|
62
65
|
def shorten_base64_images=: (bool) -> bool
|
|
63
66
|
|
|
67
|
+
attr_reader stop_after_ms: Integer?
|
|
68
|
+
|
|
69
|
+
def stop_after_ms=: (Integer) -> Integer
|
|
70
|
+
|
|
64
71
|
attr_reader timeout_ms: Integer?
|
|
65
72
|
|
|
66
73
|
def timeout_ms=: (Integer) -> Integer
|
|
@@ -86,8 +93,9 @@ module ContextDev
|
|
|
86
93
|
?max_age_ms: Integer,
|
|
87
94
|
?max_depth: Integer,
|
|
88
95
|
?max_pages: Integer,
|
|
89
|
-
?
|
|
96
|
+
?pdf: ContextDev::WebWebCrawlMdParams::Pdf,
|
|
90
97
|
?shorten_base64_images: bool,
|
|
98
|
+
?stop_after_ms: Integer,
|
|
91
99
|
?timeout_ms: Integer,
|
|
92
100
|
?url_regex: String,
|
|
93
101
|
?use_main_content_only: bool,
|
|
@@ -104,14 +112,39 @@ module ContextDev
|
|
|
104
112
|
max_age_ms: Integer,
|
|
105
113
|
max_depth: Integer,
|
|
106
114
|
max_pages: Integer,
|
|
107
|
-
|
|
115
|
+
pdf: ContextDev::WebWebCrawlMdParams::Pdf,
|
|
108
116
|
:shorten_base64_images => bool,
|
|
117
|
+
stop_after_ms: Integer,
|
|
109
118
|
timeout_ms: Integer,
|
|
110
119
|
url_regex: String,
|
|
111
120
|
use_main_content_only: bool,
|
|
112
121
|
wait_for_ms: Integer,
|
|
113
122
|
request_options: ContextDev::RequestOptions
|
|
114
123
|
}
|
|
124
|
+
|
|
125
|
+
type pdf = { end_: Integer, should_parse: bool, start: Integer }
|
|
126
|
+
|
|
127
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
128
|
+
attr_reader end_: Integer?
|
|
129
|
+
|
|
130
|
+
def end_=: (Integer) -> Integer
|
|
131
|
+
|
|
132
|
+
attr_reader should_parse: bool?
|
|
133
|
+
|
|
134
|
+
def should_parse=: (bool) -> bool
|
|
135
|
+
|
|
136
|
+
attr_reader start: Integer?
|
|
137
|
+
|
|
138
|
+
def start=: (Integer) -> Integer
|
|
139
|
+
|
|
140
|
+
def initialize: (
|
|
141
|
+
?end_: Integer,
|
|
142
|
+
?should_parse: bool,
|
|
143
|
+
?start: Integer
|
|
144
|
+
) -> void
|
|
145
|
+
|
|
146
|
+
def to_hash: -> { end_: Integer, should_parse: bool, start: Integer }
|
|
147
|
+
end
|
|
115
148
|
end
|
|
116
149
|
end
|
|
117
150
|
end
|
|
@@ -5,7 +5,7 @@ module ContextDev
|
|
|
5
5
|
url: String,
|
|
6
6
|
include_frames: bool,
|
|
7
7
|
max_age_ms: Integer,
|
|
8
|
-
|
|
8
|
+
pdf: ContextDev::WebWebScrapeHTMLParams::Pdf,
|
|
9
9
|
timeout_ms: Integer,
|
|
10
10
|
wait_for_ms: Integer
|
|
11
11
|
}
|
|
@@ -25,9 +25,11 @@ module ContextDev
|
|
|
25
25
|
|
|
26
26
|
def max_age_ms=: (Integer) -> Integer
|
|
27
27
|
|
|
28
|
-
attr_reader
|
|
28
|
+
attr_reader pdf: ContextDev::WebWebScrapeHTMLParams::Pdf?
|
|
29
29
|
|
|
30
|
-
def
|
|
30
|
+
def pdf=: (
|
|
31
|
+
ContextDev::WebWebScrapeHTMLParams::Pdf
|
|
32
|
+
) -> ContextDev::WebWebScrapeHTMLParams::Pdf
|
|
31
33
|
|
|
32
34
|
attr_reader timeout_ms: Integer?
|
|
33
35
|
|
|
@@ -41,7 +43,7 @@ module ContextDev
|
|
|
41
43
|
url: String,
|
|
42
44
|
?include_frames: bool,
|
|
43
45
|
?max_age_ms: Integer,
|
|
44
|
-
?
|
|
46
|
+
?pdf: ContextDev::WebWebScrapeHTMLParams::Pdf,
|
|
45
47
|
?timeout_ms: Integer,
|
|
46
48
|
?wait_for_ms: Integer,
|
|
47
49
|
?request_options: ContextDev::request_opts
|
|
@@ -51,11 +53,35 @@ module ContextDev
|
|
|
51
53
|
url: String,
|
|
52
54
|
include_frames: bool,
|
|
53
55
|
max_age_ms: Integer,
|
|
54
|
-
|
|
56
|
+
pdf: ContextDev::WebWebScrapeHTMLParams::Pdf,
|
|
55
57
|
timeout_ms: Integer,
|
|
56
58
|
wait_for_ms: Integer,
|
|
57
59
|
request_options: ContextDev::RequestOptions
|
|
58
60
|
}
|
|
61
|
+
|
|
62
|
+
type pdf = { end_: Integer, should_parse: bool, start: Integer }
|
|
63
|
+
|
|
64
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
65
|
+
attr_reader end_: Integer?
|
|
66
|
+
|
|
67
|
+
def end_=: (Integer) -> Integer
|
|
68
|
+
|
|
69
|
+
attr_reader should_parse: bool?
|
|
70
|
+
|
|
71
|
+
def should_parse=: (bool) -> bool
|
|
72
|
+
|
|
73
|
+
attr_reader start: Integer?
|
|
74
|
+
|
|
75
|
+
def start=: (Integer) -> Integer
|
|
76
|
+
|
|
77
|
+
def initialize: (
|
|
78
|
+
?end_: Integer,
|
|
79
|
+
?should_parse: bool,
|
|
80
|
+
?start: Integer
|
|
81
|
+
) -> void
|
|
82
|
+
|
|
83
|
+
def to_hash: -> { end_: Integer, should_parse: bool, start: Integer }
|
|
84
|
+
end
|
|
59
85
|
end
|
|
60
86
|
end
|
|
61
87
|
end
|
|
@@ -7,7 +7,7 @@ module ContextDev
|
|
|
7
7
|
include_images: bool,
|
|
8
8
|
include_links: bool,
|
|
9
9
|
max_age_ms: Integer,
|
|
10
|
-
|
|
10
|
+
pdf: ContextDev::WebWebScrapeMdParams::Pdf,
|
|
11
11
|
:shorten_base64_images => bool,
|
|
12
12
|
timeout_ms: Integer,
|
|
13
13
|
use_main_content_only: bool,
|
|
@@ -37,9 +37,11 @@ module ContextDev
|
|
|
37
37
|
|
|
38
38
|
def max_age_ms=: (Integer) -> Integer
|
|
39
39
|
|
|
40
|
-
attr_reader
|
|
40
|
+
attr_reader pdf: ContextDev::WebWebScrapeMdParams::Pdf?
|
|
41
41
|
|
|
42
|
-
def
|
|
42
|
+
def pdf=: (
|
|
43
|
+
ContextDev::WebWebScrapeMdParams::Pdf
|
|
44
|
+
) -> ContextDev::WebWebScrapeMdParams::Pdf
|
|
43
45
|
|
|
44
46
|
attr_reader shorten_base64_images: bool?
|
|
45
47
|
|
|
@@ -63,7 +65,7 @@ module ContextDev
|
|
|
63
65
|
?include_images: bool,
|
|
64
66
|
?include_links: bool,
|
|
65
67
|
?max_age_ms: Integer,
|
|
66
|
-
?
|
|
68
|
+
?pdf: ContextDev::WebWebScrapeMdParams::Pdf,
|
|
67
69
|
?shorten_base64_images: bool,
|
|
68
70
|
?timeout_ms: Integer,
|
|
69
71
|
?use_main_content_only: bool,
|
|
@@ -77,13 +79,37 @@ module ContextDev
|
|
|
77
79
|
include_images: bool,
|
|
78
80
|
include_links: bool,
|
|
79
81
|
max_age_ms: Integer,
|
|
80
|
-
|
|
82
|
+
pdf: ContextDev::WebWebScrapeMdParams::Pdf,
|
|
81
83
|
:shorten_base64_images => bool,
|
|
82
84
|
timeout_ms: Integer,
|
|
83
85
|
use_main_content_only: bool,
|
|
84
86
|
wait_for_ms: Integer,
|
|
85
87
|
request_options: ContextDev::RequestOptions
|
|
86
88
|
}
|
|
89
|
+
|
|
90
|
+
type pdf = { end_: Integer, should_parse: bool, start: Integer }
|
|
91
|
+
|
|
92
|
+
class Pdf < ContextDev::Internal::Type::BaseModel
|
|
93
|
+
attr_reader end_: Integer?
|
|
94
|
+
|
|
95
|
+
def end_=: (Integer) -> Integer
|
|
96
|
+
|
|
97
|
+
attr_reader should_parse: bool?
|
|
98
|
+
|
|
99
|
+
def should_parse=: (bool) -> bool
|
|
100
|
+
|
|
101
|
+
attr_reader start: Integer?
|
|
102
|
+
|
|
103
|
+
def start=: (Integer) -> Integer
|
|
104
|
+
|
|
105
|
+
def initialize: (
|
|
106
|
+
?end_: Integer,
|
|
107
|
+
?should_parse: bool,
|
|
108
|
+
?start: Integer
|
|
109
|
+
) -> void
|
|
110
|
+
|
|
111
|
+
def to_hash: -> { end_: Integer, should_parse: bool, start: Integer }
|
|
112
|
+
end
|
|
87
113
|
end
|
|
88
114
|
end
|
|
89
115
|
end
|
|
@@ -36,8 +36,9 @@ module ContextDev
|
|
|
36
36
|
?max_age_ms: Integer,
|
|
37
37
|
?max_depth: Integer,
|
|
38
38
|
?max_pages: Integer,
|
|
39
|
-
?
|
|
39
|
+
?pdf: ContextDev::WebWebCrawlMdParams::Pdf,
|
|
40
40
|
?shorten_base64_images: bool,
|
|
41
|
+
?stop_after_ms: Integer,
|
|
41
42
|
?timeout_ms: Integer,
|
|
42
43
|
?url_regex: String,
|
|
43
44
|
?use_main_content_only: bool,
|
|
@@ -49,7 +50,7 @@ module ContextDev
|
|
|
49
50
|
url: String,
|
|
50
51
|
?include_frames: bool,
|
|
51
52
|
?max_age_ms: Integer,
|
|
52
|
-
?
|
|
53
|
+
?pdf: ContextDev::WebWebScrapeHTMLParams::Pdf,
|
|
53
54
|
?timeout_ms: Integer,
|
|
54
55
|
?wait_for_ms: Integer,
|
|
55
56
|
?request_options: ContextDev::request_opts
|
|
@@ -70,7 +71,7 @@ module ContextDev
|
|
|
70
71
|
?include_images: bool,
|
|
71
72
|
?include_links: bool,
|
|
72
73
|
?max_age_ms: Integer,
|
|
73
|
-
?
|
|
74
|
+
?pdf: ContextDev::WebWebScrapeMdParams::Pdf,
|
|
74
75
|
?shorten_base64_images: bool,
|
|
75
76
|
?timeout_ms: Integer,
|
|
76
77
|
?use_main_content_only: bool,
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: context.dev
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.17.0
|
|
4
|
+
version: 1.18.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Context Dev
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-10 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: cgi
|