context.dev 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/lib/context_dev/models/web_web_crawl_md_params.rb +11 -1
- data/lib/context_dev/models/web_web_crawl_md_response.rb +12 -1
- data/lib/context_dev/models/web_web_scrape_html_params.rb +11 -1
- data/lib/context_dev/models/web_web_scrape_md_params.rb +11 -1
- data/lib/context_dev/resources/web.rb +11 -4
- data/lib/context_dev/version.rb +1 -1
- data/rbi/context_dev/models/web_web_crawl_md_params.rbi +15 -0
- data/rbi/context_dev/models/web_web_crawl_md_response.rbi +8 -0
- data/rbi/context_dev/models/web_web_scrape_html_params.rbi +15 -0
- data/rbi/context_dev/models/web_web_scrape_md_params.rbi +15 -0
- data/rbi/context_dev/resources/web.rbi +15 -0
- data/sig/context_dev/models/web_web_crawl_md_params.rbs +7 -0
- data/sig/context_dev/models/web_web_crawl_md_response.rbs +5 -0
- data/sig/context_dev/models/web_web_scrape_html_params.rbs +7 -1
- data/sig/context_dev/models/web_web_scrape_md_params.rbs +7 -0
- data/sig/context_dev/resources/web.rbs +3 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cdcc91e409d0287f98b023ccbf680145a44d7c7ac9dffbf7be5fd1251fdbcc97
|
|
4
|
+
data.tar.gz: f5b48e008a3ed93e386513f9fcd8d215888ce5f8adfbc909cac3adcc5b313a27
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 37ab631ac032f9e15ff462159936d5a3b0e572890552853c4a22603530ae9f56cece10ee3e67a4a4a6e46f7b5f3bbbadcd6beebd2d8bf183243206e97e4f812c
|
|
7
|
+
data.tar.gz: 80757fb5ae2d334ec0716246dfd4702754fb442f6bb2b367b41419cd4f6274d6971b6c5ca7eeb5ff18972eda0c6d2bca9d2f609222e44c97627e1d5f3cc22777
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.7.0 (2026-04-24)
|
|
4
|
+
|
|
5
|
+
Full Changelog: [v1.6.0...v1.7.0](https://github.com/context-dot-dev/context-ruby-sdk/compare/v1.6.0...v1.7.0)
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
* **api:** api update ([9c8b4d9](https://github.com/context-dot-dev/context-ruby-sdk/commit/9c8b4d9b1d813c5b7992998a3fa23cde63fe2f4c))
|
|
10
|
+
|
|
3
11
|
## 1.6.0 (2026-04-23)
|
|
4
12
|
|
|
5
13
|
Full Changelog: [v1.5.0...v1.6.0](https://github.com/context-dot-dev/context-ruby-sdk/compare/v1.5.0...v1.6.0)
|
data/README.md
CHANGED
|
@@ -53,6 +53,14 @@ module ContextDev
|
|
|
53
53
|
# @return [Integer, nil]
|
|
54
54
|
optional :max_pages, Integer, api_name: :maxPages
|
|
55
55
|
|
|
56
|
+
# @!attribute parse_pdf
|
|
57
|
+
# When true (default), PDF pages are fetched and their text layer is extracted and
|
|
58
|
+
# converted to Markdown alongside HTML pages. When false, PDF pages are skipped
|
|
59
|
+
# entirely (not included in results and not counted as failures).
|
|
60
|
+
#
|
|
61
|
+
# @return [Boolean, nil]
|
|
62
|
+
optional :parse_pdf, ContextDev::Internal::Type::Boolean, api_name: :parsePDF
|
|
63
|
+
|
|
56
64
|
# @!attribute shorten_base64_images
|
|
57
65
|
# Truncate base64-encoded image data in the Markdown output
|
|
58
66
|
#
|
|
@@ -72,7 +80,7 @@ module ContextDev
|
|
|
72
80
|
# @return [Boolean, nil]
|
|
73
81
|
optional :use_main_content_only, ContextDev::Internal::Type::Boolean, api_name: :useMainContentOnly
|
|
74
82
|
|
|
75
|
-
# @!method initialize(url:, follow_subdomains: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil, shorten_base64_images: nil, url_regex: nil, use_main_content_only: nil, request_options: {})
|
|
83
|
+
# @!method initialize(url:, follow_subdomains: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil, parse_pdf: nil, shorten_base64_images: nil, url_regex: nil, use_main_content_only: nil, request_options: {})
|
|
76
84
|
# Some parameter documentations has been truncated, see
|
|
77
85
|
# {ContextDev::Models::WebWebCrawlMdParams} for more details.
|
|
78
86
|
#
|
|
@@ -90,6 +98,8 @@ module ContextDev
|
|
|
90
98
|
#
|
|
91
99
|
# @param max_pages [Integer] Maximum number of pages to crawl. Hard cap: 500.
|
|
92
100
|
#
|
|
101
|
+
# @param parse_pdf [Boolean] When true (default), PDF pages are fetched and their text layer is extracted and
|
|
102
|
+
#
|
|
93
103
|
# @param shorten_base64_images [Boolean] Truncate base64-encoded image data in the Markdown output
|
|
94
104
|
#
|
|
95
105
|
# @param url_regex [String] Regex pattern. Only URLs matching this pattern will be followed and scraped.
|
|
@@ -33,6 +33,12 @@ module ContextDev
|
|
|
33
33
|
# @return [Integer]
|
|
34
34
|
required :num_failed, Integer, api_name: :numFailed
|
|
35
35
|
|
|
36
|
+
# @!attribute num_skipped
|
|
37
|
+
# Number of URLs skipped (PDFs when parsePDF=false, or URLs not matching urlRegex)
|
|
38
|
+
#
|
|
39
|
+
# @return [Integer]
|
|
40
|
+
required :num_skipped, Integer, api_name: :numSkipped
|
|
41
|
+
|
|
36
42
|
# @!attribute num_succeeded
|
|
37
43
|
# Number of pages successfully crawled
|
|
38
44
|
#
|
|
@@ -45,11 +51,16 @@ module ContextDev
|
|
|
45
51
|
# @return [Integer]
|
|
46
52
|
required :num_urls, Integer, api_name: :numUrls
|
|
47
53
|
|
|
48
|
-
# @!method initialize(max_crawl_depth:, num_failed:, num_succeeded:, num_urls:)
|
|
54
|
+
# @!method initialize(max_crawl_depth:, num_failed:, num_skipped:, num_succeeded:, num_urls:)
|
|
55
|
+
# Some parameter documentations has been truncated, see
|
|
56
|
+
# {ContextDev::Models::WebWebCrawlMdResponse::Metadata} for more details.
|
|
57
|
+
#
|
|
49
58
|
# @param max_crawl_depth [Integer] Maximum crawl depth reached during the crawl
|
|
50
59
|
#
|
|
51
60
|
# @param num_failed [Integer] Number of pages that failed to crawl
|
|
52
61
|
#
|
|
62
|
+
# @param num_skipped [Integer] Number of URLs skipped (PDFs when parsePDF=false, or URLs not matching urlRegex)
|
|
63
|
+
#
|
|
53
64
|
# @param num_succeeded [Integer] Number of pages successfully crawled
|
|
54
65
|
#
|
|
55
66
|
# @param num_urls [Integer] Total number of URLs crawled
|
|
@@ -21,7 +21,15 @@ module ContextDev
|
|
|
21
21
|
# @return [Integer, nil]
|
|
22
22
|
optional :max_age_ms, Integer
|
|
23
23
|
|
|
24
|
-
# @!method initialize(url:, max_age_ms: nil, request_options: {})
|
|
24
|
+
# @!attribute parse_pdf
|
|
25
|
+
# When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
26
|
+
# returned wrapped in <html><pdf>…</pdf></html>. When false, PDF URLs are skipped
|
|
27
|
+
# and a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
28
|
+
#
|
|
29
|
+
# @return [Boolean, nil]
|
|
30
|
+
optional :parse_pdf, ContextDev::Internal::Type::Boolean
|
|
31
|
+
|
|
32
|
+
# @!method initialize(url:, max_age_ms: nil, parse_pdf: nil, request_options: {})
|
|
25
33
|
# Some parameter documentations has been truncated, see
|
|
26
34
|
# {ContextDev::Models::WebWebScrapeHTMLParams} for more details.
|
|
27
35
|
#
|
|
@@ -29,6 +37,8 @@ module ContextDev
|
|
|
29
37
|
#
|
|
30
38
|
# @param max_age_ms [Integer] Return a cached result if a prior scrape for the same parameters exists and is y
|
|
31
39
|
#
|
|
40
|
+
# @param parse_pdf [Boolean] When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
41
|
+
#
|
|
32
42
|
# @param request_options [ContextDev::RequestOptions, Hash{Symbol=>Object}]
|
|
33
43
|
end
|
|
34
44
|
end
|
|
@@ -34,6 +34,14 @@ module ContextDev
|
|
|
34
34
|
# @return [Integer, nil]
|
|
35
35
|
optional :max_age_ms, Integer
|
|
36
36
|
|
|
37
|
+
# @!attribute parse_pdf
|
|
38
|
+
# When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
39
|
+
# converted to Markdown. When false, PDF URLs are skipped and a 400
|
|
40
|
+
# WEBSITE_ACCESS_ERROR is returned.
|
|
41
|
+
#
|
|
42
|
+
# @return [Boolean, nil]
|
|
43
|
+
optional :parse_pdf, ContextDev::Internal::Type::Boolean
|
|
44
|
+
|
|
37
45
|
# @!attribute shorten_base64_images
|
|
38
46
|
# Shorten base64-encoded image data in the Markdown output
|
|
39
47
|
#
|
|
@@ -47,7 +55,7 @@ module ContextDev
|
|
|
47
55
|
# @return [Boolean, nil]
|
|
48
56
|
optional :use_main_content_only, ContextDev::Internal::Type::Boolean
|
|
49
57
|
|
|
50
|
-
# @!method initialize(url:, include_images: nil, include_links: nil, max_age_ms: nil, shorten_base64_images: nil, use_main_content_only: nil, request_options: {})
|
|
58
|
+
# @!method initialize(url:, include_images: nil, include_links: nil, max_age_ms: nil, parse_pdf: nil, shorten_base64_images: nil, use_main_content_only: nil, request_options: {})
|
|
51
59
|
# Some parameter documentations has been truncated, see
|
|
52
60
|
# {ContextDev::Models::WebWebScrapeMdParams} for more details.
|
|
53
61
|
#
|
|
@@ -59,6 +67,8 @@ module ContextDev
|
|
|
59
67
|
#
|
|
60
68
|
# @param max_age_ms [Integer] Return a cached result if a prior scrape for the same parameters exists and is y
|
|
61
69
|
#
|
|
70
|
+
# @param parse_pdf [Boolean] When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
71
|
+
#
|
|
62
72
|
# @param shorten_base64_images [Boolean] Shorten base64-encoded image data in the Markdown output
|
|
63
73
|
#
|
|
64
74
|
# @param use_main_content_only [Boolean] Extract only the main content of the page, excluding headers, footers, sidebars,
|
|
@@ -105,7 +105,7 @@ module ContextDev
|
|
|
105
105
|
# Performs a crawl starting from a given URL, extracts page content as Markdown,
|
|
106
106
|
# and returns results for all crawled pages.
|
|
107
107
|
#
|
|
108
|
-
# @overload web_crawl_md(url:, follow_subdomains: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil, shorten_base64_images: nil, url_regex: nil, use_main_content_only: nil, request_options: {})
|
|
108
|
+
# @overload web_crawl_md(url:, follow_subdomains: nil, include_images: nil, include_links: nil, max_age_ms: nil, max_depth: nil, max_pages: nil, parse_pdf: nil, shorten_base64_images: nil, url_regex: nil, use_main_content_only: nil, request_options: {})
|
|
109
109
|
#
|
|
110
110
|
# @param url [String] The starting URL for the crawl (must include http:// or https:// protocol)
|
|
111
111
|
#
|
|
@@ -121,6 +121,8 @@ module ContextDev
|
|
|
121
121
|
#
|
|
122
122
|
# @param max_pages [Integer] Maximum number of pages to crawl. Hard cap: 500.
|
|
123
123
|
#
|
|
124
|
+
# @param parse_pdf [Boolean] When true (default), PDF pages are fetched and their text layer is extracted and
|
|
125
|
+
#
|
|
124
126
|
# @param shorten_base64_images [Boolean] Truncate base64-encoded image data in the Markdown output
|
|
125
127
|
#
|
|
126
128
|
# @param url_regex [String] Regex pattern. Only URLs matching this pattern will be followed and scraped.
|
|
@@ -148,12 +150,14 @@ module ContextDev
|
|
|
148
150
|
#
|
|
149
151
|
# Scrapes the given URL and returns the raw HTML content of the page.
|
|
150
152
|
#
|
|
151
|
-
# @overload web_scrape_html(url:, max_age_ms: nil, request_options: {})
|
|
153
|
+
# @overload web_scrape_html(url:, max_age_ms: nil, parse_pdf: nil, request_options: {})
|
|
152
154
|
#
|
|
153
155
|
# @param url [String] Full URL to scrape (must include http:// or https:// protocol)
|
|
154
156
|
#
|
|
155
157
|
# @param max_age_ms [Integer] Return a cached result if a prior scrape for the same parameters exists and is y
|
|
156
158
|
#
|
|
159
|
+
# @param parse_pdf [Boolean] When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
160
|
+
#
|
|
157
161
|
# @param request_options [ContextDev::RequestOptions, Hash{Symbol=>Object}, nil]
|
|
158
162
|
#
|
|
159
163
|
# @return [ContextDev::Models::WebWebScrapeHTMLResponse]
|
|
@@ -165,7 +169,7 @@ module ContextDev
|
|
|
165
169
|
@client.request(
|
|
166
170
|
method: :get,
|
|
167
171
|
path: "web/scrape/html",
|
|
168
|
-
query: query.transform_keys(max_age_ms: "maxAgeMs"),
|
|
172
|
+
query: query.transform_keys(max_age_ms: "maxAgeMs", parse_pdf: "parsePDF"),
|
|
169
173
|
model: ContextDev::Models::WebWebScrapeHTMLResponse,
|
|
170
174
|
options: options
|
|
171
175
|
)
|
|
@@ -201,7 +205,7 @@ module ContextDev
|
|
|
201
205
|
#
|
|
202
206
|
# Scrapes the given URL into LLM usable Markdown.
|
|
203
207
|
#
|
|
204
|
-
# @overload web_scrape_md(url:, include_images: nil, include_links: nil, max_age_ms: nil, shorten_base64_images: nil, use_main_content_only: nil, request_options: {})
|
|
208
|
+
# @overload web_scrape_md(url:, include_images: nil, include_links: nil, max_age_ms: nil, parse_pdf: nil, shorten_base64_images: nil, use_main_content_only: nil, request_options: {})
|
|
205
209
|
#
|
|
206
210
|
# @param url [String] Full URL to scrape into LLM usable Markdown (must include http:// or https:// pr
|
|
207
211
|
#
|
|
@@ -211,6 +215,8 @@ module ContextDev
|
|
|
211
215
|
#
|
|
212
216
|
# @param max_age_ms [Integer] Return a cached result if a prior scrape for the same parameters exists and is y
|
|
213
217
|
#
|
|
218
|
+
# @param parse_pdf [Boolean] When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
219
|
+
#
|
|
214
220
|
# @param shorten_base64_images [Boolean] Shorten base64-encoded image data in the Markdown output
|
|
215
221
|
#
|
|
216
222
|
# @param use_main_content_only [Boolean] Extract only the main content of the page, excluding headers, footers, sidebars,
|
|
@@ -230,6 +236,7 @@ module ContextDev
|
|
|
230
236
|
include_images: "includeImages",
|
|
231
237
|
include_links: "includeLinks",
|
|
232
238
|
max_age_ms: "maxAgeMs",
|
|
239
|
+
parse_pdf: "parsePDF",
|
|
233
240
|
shorten_base64_images: "shortenBase64Images",
|
|
234
241
|
use_main_content_only: "useMainContentOnly"
|
|
235
242
|
),
|
data/lib/context_dev/version.rb
CHANGED
|
@@ -61,6 +61,15 @@ module ContextDev
|
|
|
61
61
|
sig { params(max_pages: Integer).void }
|
|
62
62
|
attr_writer :max_pages
|
|
63
63
|
|
|
64
|
+
# When true (default), PDF pages are fetched and their text layer is extracted and
|
|
65
|
+
# converted to Markdown alongside HTML pages. When false, PDF pages are skipped
|
|
66
|
+
# entirely (not included in results and not counted as failures).
|
|
67
|
+
sig { returns(T.nilable(T::Boolean)) }
|
|
68
|
+
attr_reader :parse_pdf
|
|
69
|
+
|
|
70
|
+
sig { params(parse_pdf: T::Boolean).void }
|
|
71
|
+
attr_writer :parse_pdf
|
|
72
|
+
|
|
64
73
|
# Truncate base64-encoded image data in the Markdown output
|
|
65
74
|
sig { returns(T.nilable(T::Boolean)) }
|
|
66
75
|
attr_reader :shorten_base64_images
|
|
@@ -92,6 +101,7 @@ module ContextDev
|
|
|
92
101
|
max_age_ms: Integer,
|
|
93
102
|
max_depth: Integer,
|
|
94
103
|
max_pages: Integer,
|
|
104
|
+
parse_pdf: T::Boolean,
|
|
95
105
|
shorten_base64_images: T::Boolean,
|
|
96
106
|
url_regex: String,
|
|
97
107
|
use_main_content_only: T::Boolean,
|
|
@@ -117,6 +127,10 @@ module ContextDev
|
|
|
117
127
|
max_depth: nil,
|
|
118
128
|
# Maximum number of pages to crawl. Hard cap: 500.
|
|
119
129
|
max_pages: nil,
|
|
130
|
+
# When true (default), PDF pages are fetched and their text layer is extracted and
|
|
131
|
+
# converted to Markdown alongside HTML pages. When false, PDF pages are skipped
|
|
132
|
+
# entirely (not included in results and not counted as failures).
|
|
133
|
+
parse_pdf: nil,
|
|
120
134
|
# Truncate base64-encoded image data in the Markdown output
|
|
121
135
|
shorten_base64_images: nil,
|
|
122
136
|
# Regex pattern. Only URLs matching this pattern will be followed and scraped.
|
|
@@ -138,6 +152,7 @@ module ContextDev
|
|
|
138
152
|
max_age_ms: Integer,
|
|
139
153
|
max_depth: Integer,
|
|
140
154
|
max_pages: Integer,
|
|
155
|
+
parse_pdf: T::Boolean,
|
|
141
156
|
shorten_base64_images: T::Boolean,
|
|
142
157
|
url_regex: String,
|
|
143
158
|
use_main_content_only: T::Boolean,
|
|
@@ -64,6 +64,10 @@ module ContextDev
|
|
|
64
64
|
sig { returns(Integer) }
|
|
65
65
|
attr_accessor :num_failed
|
|
66
66
|
|
|
67
|
+
# Number of URLs skipped (PDFs when parsePDF=false, or URLs not matching urlRegex)
|
|
68
|
+
sig { returns(Integer) }
|
|
69
|
+
attr_accessor :num_skipped
|
|
70
|
+
|
|
67
71
|
# Number of pages successfully crawled
|
|
68
72
|
sig { returns(Integer) }
|
|
69
73
|
attr_accessor :num_succeeded
|
|
@@ -76,6 +80,7 @@ module ContextDev
|
|
|
76
80
|
params(
|
|
77
81
|
max_crawl_depth: Integer,
|
|
78
82
|
num_failed: Integer,
|
|
83
|
+
num_skipped: Integer,
|
|
79
84
|
num_succeeded: Integer,
|
|
80
85
|
num_urls: Integer
|
|
81
86
|
).returns(T.attached_class)
|
|
@@ -85,6 +90,8 @@ module ContextDev
|
|
|
85
90
|
max_crawl_depth:,
|
|
86
91
|
# Number of pages that failed to crawl
|
|
87
92
|
num_failed:,
|
|
93
|
+
# Number of URLs skipped (PDFs when parsePDF=false, or URLs not matching urlRegex)
|
|
94
|
+
num_skipped:,
|
|
88
95
|
# Number of pages successfully crawled
|
|
89
96
|
num_succeeded:,
|
|
90
97
|
# Total number of URLs crawled
|
|
@@ -97,6 +104,7 @@ module ContextDev
|
|
|
97
104
|
{
|
|
98
105
|
max_crawl_depth: Integer,
|
|
99
106
|
num_failed: Integer,
|
|
107
|
+
num_skipped: Integer,
|
|
100
108
|
num_succeeded: Integer,
|
|
101
109
|
num_urls: Integer
|
|
102
110
|
}
|
|
@@ -27,10 +27,20 @@ module ContextDev
|
|
|
27
27
|
sig { params(max_age_ms: Integer).void }
|
|
28
28
|
attr_writer :max_age_ms
|
|
29
29
|
|
|
30
|
+
# When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
31
|
+
# returned wrapped in <html><pdf>…</pdf></html>. When false, PDF URLs are skipped
|
|
32
|
+
# and a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
33
|
+
sig { returns(T.nilable(T::Boolean)) }
|
|
34
|
+
attr_reader :parse_pdf
|
|
35
|
+
|
|
36
|
+
sig { params(parse_pdf: T::Boolean).void }
|
|
37
|
+
attr_writer :parse_pdf
|
|
38
|
+
|
|
30
39
|
sig do
|
|
31
40
|
params(
|
|
32
41
|
url: String,
|
|
33
42
|
max_age_ms: Integer,
|
|
43
|
+
parse_pdf: T::Boolean,
|
|
34
44
|
request_options: ContextDev::RequestOptions::OrHash
|
|
35
45
|
).returns(T.attached_class)
|
|
36
46
|
end
|
|
@@ -41,6 +51,10 @@ module ContextDev
|
|
|
41
51
|
# younger than this many milliseconds. Defaults to 1 day (86400000 ms) when
|
|
42
52
|
# omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.
|
|
43
53
|
max_age_ms: nil,
|
|
54
|
+
# When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
55
|
+
# returned wrapped in <html><pdf>…</pdf></html>. When false, PDF URLs are skipped
|
|
56
|
+
# and a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
57
|
+
parse_pdf: nil,
|
|
44
58
|
request_options: {}
|
|
45
59
|
)
|
|
46
60
|
end
|
|
@@ -50,6 +64,7 @@ module ContextDev
|
|
|
50
64
|
{
|
|
51
65
|
url: String,
|
|
52
66
|
max_age_ms: Integer,
|
|
67
|
+
parse_pdf: T::Boolean,
|
|
53
68
|
request_options: ContextDev::RequestOptions
|
|
54
69
|
}
|
|
55
70
|
)
|
|
@@ -39,6 +39,15 @@ module ContextDev
|
|
|
39
39
|
sig { params(max_age_ms: Integer).void }
|
|
40
40
|
attr_writer :max_age_ms
|
|
41
41
|
|
|
42
|
+
# When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
43
|
+
# converted to Markdown. When false, PDF URLs are skipped and a 400
|
|
44
|
+
# WEBSITE_ACCESS_ERROR is returned.
|
|
45
|
+
sig { returns(T.nilable(T::Boolean)) }
|
|
46
|
+
attr_reader :parse_pdf
|
|
47
|
+
|
|
48
|
+
sig { params(parse_pdf: T::Boolean).void }
|
|
49
|
+
attr_writer :parse_pdf
|
|
50
|
+
|
|
42
51
|
# Shorten base64-encoded image data in the Markdown output
|
|
43
52
|
sig { returns(T.nilable(T::Boolean)) }
|
|
44
53
|
attr_reader :shorten_base64_images
|
|
@@ -60,6 +69,7 @@ module ContextDev
|
|
|
60
69
|
include_images: T::Boolean,
|
|
61
70
|
include_links: T::Boolean,
|
|
62
71
|
max_age_ms: Integer,
|
|
72
|
+
parse_pdf: T::Boolean,
|
|
63
73
|
shorten_base64_images: T::Boolean,
|
|
64
74
|
use_main_content_only: T::Boolean,
|
|
65
75
|
request_options: ContextDev::RequestOptions::OrHash
|
|
@@ -77,6 +87,10 @@ module ContextDev
|
|
|
77
87
|
# younger than this many milliseconds. Defaults to 1 day (86400000 ms) when
|
|
78
88
|
# omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.
|
|
79
89
|
max_age_ms: nil,
|
|
90
|
+
# When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
91
|
+
# converted to Markdown. When false, PDF URLs are skipped and a 400
|
|
92
|
+
# WEBSITE_ACCESS_ERROR is returned.
|
|
93
|
+
parse_pdf: nil,
|
|
80
94
|
# Shorten base64-encoded image data in the Markdown output
|
|
81
95
|
shorten_base64_images: nil,
|
|
82
96
|
# Extract only the main content of the page, excluding headers, footers, sidebars,
|
|
@@ -93,6 +107,7 @@ module ContextDev
|
|
|
93
107
|
include_images: T::Boolean,
|
|
94
108
|
include_links: T::Boolean,
|
|
95
109
|
max_age_ms: Integer,
|
|
110
|
+
parse_pdf: T::Boolean,
|
|
96
111
|
shorten_base64_images: T::Boolean,
|
|
97
112
|
use_main_content_only: T::Boolean,
|
|
98
113
|
request_options: ContextDev::RequestOptions
|
|
@@ -108,6 +108,7 @@ module ContextDev
|
|
|
108
108
|
max_age_ms: Integer,
|
|
109
109
|
max_depth: Integer,
|
|
110
110
|
max_pages: Integer,
|
|
111
|
+
parse_pdf: T::Boolean,
|
|
111
112
|
shorten_base64_images: T::Boolean,
|
|
112
113
|
url_regex: String,
|
|
113
114
|
use_main_content_only: T::Boolean,
|
|
@@ -133,6 +134,10 @@ module ContextDev
|
|
|
133
134
|
max_depth: nil,
|
|
134
135
|
# Maximum number of pages to crawl. Hard cap: 500.
|
|
135
136
|
max_pages: nil,
|
|
137
|
+
# When true (default), PDF pages are fetched and their text layer is extracted and
|
|
138
|
+
# converted to Markdown alongside HTML pages. When false, PDF pages are skipped
|
|
139
|
+
# entirely (not included in results and not counted as failures).
|
|
140
|
+
parse_pdf: nil,
|
|
136
141
|
# Truncate base64-encoded image data in the Markdown output
|
|
137
142
|
shorten_base64_images: nil,
|
|
138
143
|
# Regex pattern. Only URLs matching this pattern will be followed and scraped.
|
|
@@ -149,6 +154,7 @@ module ContextDev
|
|
|
149
154
|
params(
|
|
150
155
|
url: String,
|
|
151
156
|
max_age_ms: Integer,
|
|
157
|
+
parse_pdf: T::Boolean,
|
|
152
158
|
request_options: ContextDev::RequestOptions::OrHash
|
|
153
159
|
).returns(ContextDev::Models::WebWebScrapeHTMLResponse)
|
|
154
160
|
end
|
|
@@ -159,6 +165,10 @@ module ContextDev
|
|
|
159
165
|
# younger than this many milliseconds. Defaults to 1 day (86400000 ms) when
|
|
160
166
|
# omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.
|
|
161
167
|
max_age_ms: nil,
|
|
168
|
+
# When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
169
|
+
# returned wrapped in <html><pdf>…</pdf></html>. When false, PDF URLs are skipped
|
|
170
|
+
# and a 400 WEBSITE_ACCESS_ERROR is returned.
|
|
171
|
+
parse_pdf: nil,
|
|
162
172
|
request_options: {}
|
|
163
173
|
)
|
|
164
174
|
end
|
|
@@ -186,6 +196,7 @@ module ContextDev
|
|
|
186
196
|
include_images: T::Boolean,
|
|
187
197
|
include_links: T::Boolean,
|
|
188
198
|
max_age_ms: Integer,
|
|
199
|
+
parse_pdf: T::Boolean,
|
|
189
200
|
shorten_base64_images: T::Boolean,
|
|
190
201
|
use_main_content_only: T::Boolean,
|
|
191
202
|
request_options: ContextDev::RequestOptions::OrHash
|
|
@@ -203,6 +214,10 @@ module ContextDev
|
|
|
203
214
|
# younger than this many milliseconds. Defaults to 1 day (86400000 ms) when
|
|
204
215
|
# omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.
|
|
205
216
|
max_age_ms: nil,
|
|
217
|
+
# When true (default), PDF URLs are fetched and their text layer is extracted and
|
|
218
|
+
# converted to Markdown. When false, PDF URLs are skipped and a 400
|
|
219
|
+
# WEBSITE_ACCESS_ERROR is returned.
|
|
220
|
+
parse_pdf: nil,
|
|
206
221
|
# Shorten base64-encoded image data in the Markdown output
|
|
207
222
|
shorten_base64_images: nil,
|
|
208
223
|
# Extract only the main content of the page, excluding headers, footers, sidebars,
|
|
@@ -9,6 +9,7 @@ module ContextDev
|
|
|
9
9
|
max_age_ms: Integer,
|
|
10
10
|
max_depth: Integer,
|
|
11
11
|
max_pages: Integer,
|
|
12
|
+
parse_pdf: bool,
|
|
12
13
|
:shorten_base64_images => bool,
|
|
13
14
|
url_regex: String,
|
|
14
15
|
use_main_content_only: bool
|
|
@@ -45,6 +46,10 @@ module ContextDev
|
|
|
45
46
|
|
|
46
47
|
def max_pages=: (Integer) -> Integer
|
|
47
48
|
|
|
49
|
+
attr_reader parse_pdf: bool?
|
|
50
|
+
|
|
51
|
+
def parse_pdf=: (bool) -> bool
|
|
52
|
+
|
|
48
53
|
attr_reader shorten_base64_images: bool?
|
|
49
54
|
|
|
50
55
|
def shorten_base64_images=: (bool) -> bool
|
|
@@ -65,6 +70,7 @@ module ContextDev
|
|
|
65
70
|
?max_age_ms: Integer,
|
|
66
71
|
?max_depth: Integer,
|
|
67
72
|
?max_pages: Integer,
|
|
73
|
+
?parse_pdf: bool,
|
|
68
74
|
?shorten_base64_images: bool,
|
|
69
75
|
?url_regex: String,
|
|
70
76
|
?use_main_content_only: bool,
|
|
@@ -79,6 +85,7 @@ module ContextDev
|
|
|
79
85
|
max_age_ms: Integer,
|
|
80
86
|
max_depth: Integer,
|
|
81
87
|
max_pages: Integer,
|
|
88
|
+
parse_pdf: bool,
|
|
82
89
|
:shorten_base64_images => bool,
|
|
83
90
|
url_regex: String,
|
|
84
91
|
use_main_content_only: bool,
|
|
@@ -25,6 +25,7 @@ module ContextDev
|
|
|
25
25
|
{
|
|
26
26
|
max_crawl_depth: Integer,
|
|
27
27
|
num_failed: Integer,
|
|
28
|
+
num_skipped: Integer,
|
|
28
29
|
num_succeeded: Integer,
|
|
29
30
|
num_urls: Integer
|
|
30
31
|
}
|
|
@@ -34,6 +35,8 @@ module ContextDev
|
|
|
34
35
|
|
|
35
36
|
attr_accessor num_failed: Integer
|
|
36
37
|
|
|
38
|
+
attr_accessor num_skipped: Integer
|
|
39
|
+
|
|
37
40
|
attr_accessor num_succeeded: Integer
|
|
38
41
|
|
|
39
42
|
attr_accessor num_urls: Integer
|
|
@@ -41,6 +44,7 @@ module ContextDev
|
|
|
41
44
|
def initialize: (
|
|
42
45
|
max_crawl_depth: Integer,
|
|
43
46
|
num_failed: Integer,
|
|
47
|
+
num_skipped: Integer,
|
|
44
48
|
num_succeeded: Integer,
|
|
45
49
|
num_urls: Integer
|
|
46
50
|
) -> void
|
|
@@ -48,6 +52,7 @@ module ContextDev
|
|
|
48
52
|
def to_hash: -> {
|
|
49
53
|
max_crawl_depth: Integer,
|
|
50
54
|
num_failed: Integer,
|
|
55
|
+
num_skipped: Integer,
|
|
51
56
|
num_succeeded: Integer,
|
|
52
57
|
num_urls: Integer
|
|
53
58
|
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module ContextDev
|
|
2
2
|
module Models
|
|
3
3
|
type web_web_scrape_html_params =
|
|
4
|
-
{ url: String, max_age_ms: Integer }
|
|
4
|
+
{ url: String, max_age_ms: Integer, parse_pdf: bool }
|
|
5
5
|
& ContextDev::Internal::Type::request_parameters
|
|
6
6
|
|
|
7
7
|
class WebWebScrapeHTMLParams < ContextDev::Internal::Type::BaseModel
|
|
@@ -14,15 +14,21 @@ module ContextDev
|
|
|
14
14
|
|
|
15
15
|
def max_age_ms=: (Integer) -> Integer
|
|
16
16
|
|
|
17
|
+
attr_reader parse_pdf: bool?
|
|
18
|
+
|
|
19
|
+
def parse_pdf=: (bool) -> bool
|
|
20
|
+
|
|
17
21
|
def initialize: (
|
|
18
22
|
url: String,
|
|
19
23
|
?max_age_ms: Integer,
|
|
24
|
+
?parse_pdf: bool,
|
|
20
25
|
?request_options: ContextDev::request_opts
|
|
21
26
|
) -> void
|
|
22
27
|
|
|
23
28
|
def to_hash: -> {
|
|
24
29
|
url: String,
|
|
25
30
|
max_age_ms: Integer,
|
|
31
|
+
parse_pdf: bool,
|
|
26
32
|
request_options: ContextDev::RequestOptions
|
|
27
33
|
}
|
|
28
34
|
end
|
|
@@ -6,6 +6,7 @@ module ContextDev
|
|
|
6
6
|
include_images: bool,
|
|
7
7
|
include_links: bool,
|
|
8
8
|
max_age_ms: Integer,
|
|
9
|
+
parse_pdf: bool,
|
|
9
10
|
:shorten_base64_images => bool,
|
|
10
11
|
use_main_content_only: bool
|
|
11
12
|
}
|
|
@@ -29,6 +30,10 @@ module ContextDev
|
|
|
29
30
|
|
|
30
31
|
def max_age_ms=: (Integer) -> Integer
|
|
31
32
|
|
|
33
|
+
attr_reader parse_pdf: bool?
|
|
34
|
+
|
|
35
|
+
def parse_pdf=: (bool) -> bool
|
|
36
|
+
|
|
32
37
|
attr_reader shorten_base64_images: bool?
|
|
33
38
|
|
|
34
39
|
def shorten_base64_images=: (bool) -> bool
|
|
@@ -42,6 +47,7 @@ module ContextDev
|
|
|
42
47
|
?include_images: bool,
|
|
43
48
|
?include_links: bool,
|
|
44
49
|
?max_age_ms: Integer,
|
|
50
|
+
?parse_pdf: bool,
|
|
45
51
|
?shorten_base64_images: bool,
|
|
46
52
|
?use_main_content_only: bool,
|
|
47
53
|
?request_options: ContextDev::request_opts
|
|
@@ -52,6 +58,7 @@ module ContextDev
|
|
|
52
58
|
include_images: bool,
|
|
53
59
|
include_links: bool,
|
|
54
60
|
max_age_ms: Integer,
|
|
61
|
+
parse_pdf: bool,
|
|
55
62
|
:shorten_base64_images => bool,
|
|
56
63
|
use_main_content_only: bool,
|
|
57
64
|
request_options: ContextDev::RequestOptions
|
|
@@ -32,6 +32,7 @@ module ContextDev
|
|
|
32
32
|
?max_age_ms: Integer,
|
|
33
33
|
?max_depth: Integer,
|
|
34
34
|
?max_pages: Integer,
|
|
35
|
+
?parse_pdf: bool,
|
|
35
36
|
?shorten_base64_images: bool,
|
|
36
37
|
?url_regex: String,
|
|
37
38
|
?use_main_content_only: bool,
|
|
@@ -41,6 +42,7 @@ module ContextDev
|
|
|
41
42
|
def web_scrape_html: (
|
|
42
43
|
url: String,
|
|
43
44
|
?max_age_ms: Integer,
|
|
45
|
+
?parse_pdf: bool,
|
|
44
46
|
?request_options: ContextDev::request_opts
|
|
45
47
|
) -> ContextDev::Models::WebWebScrapeHTMLResponse
|
|
46
48
|
|
|
@@ -54,6 +56,7 @@ module ContextDev
|
|
|
54
56
|
?include_images: bool,
|
|
55
57
|
?include_links: bool,
|
|
56
58
|
?max_age_ms: Integer,
|
|
59
|
+
?parse_pdf: bool,
|
|
57
60
|
?shorten_base64_images: bool,
|
|
58
61
|
?use_main_content_only: bool,
|
|
59
62
|
?request_options: ContextDev::request_opts
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: context.dev
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.6.0
|
|
4
|
+
version: 1.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Context Dev
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-23 00:00:00.000000000 Z
|
|
11
|
+
date: 2026-04-24 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: cgi
|