scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
data/lib/scrapetor.rb ADDED
@@ -0,0 +1,167 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "scrapetor/version"
4
+ require "scrapetor/errors"
5
+ require "scrapetor/encoding"
6
+ require "scrapetor/cleaner"
7
+ require "scrapetor/money"
8
+ require "scrapetor/url"
9
+ require "scrapetor/fingerprint"
10
+ require "scrapetor/text_node"
11
+ require "scrapetor/comment_node"
12
+ require "scrapetor/selector"
13
+ require "scrapetor/sax"
14
+ require "scrapetor/dom"
15
+ require "scrapetor/dom/parser"
16
+ require "scrapetor/dom/selectors"
17
+ require "scrapetor/node"
18
+ require "scrapetor/node_set"
19
+ require "scrapetor/schema"
20
+ require "scrapetor/extractor"
21
+ require "scrapetor/document"
22
+ require "scrapetor/template_registry"
23
+ require "scrapetor/structured_data"
24
+ require "scrapetor/microdata"
25
+ require "scrapetor/page_type"
26
+ require "scrapetor/entities"
27
+ require "scrapetor/builder"
28
+ require "scrapetor/http"
29
+ require "scrapetor/native"
30
+ require "scrapetor/native_dom"
31
+ require "scrapetor/persistent_cache"
32
+ require "scrapetor/stream"
33
+ require "scrapetor/fetcher"
34
+ require "scrapetor/session"
35
+ require "scrapetor/robots"
36
+ require "scrapetor/sitemap"
37
+ require "scrapetor/pagination"
38
+ require "scrapetor/form"
39
+ require "scrapetor/xpath"
40
+
41
module Scrapetor
  # ----- Parsing entry points -----

  # Parse an HTML string into a Document.
  #
  # When PersistentCache is enabled and +html+ is a non-empty String, a
  # previously cached native parse is reused if PersistentCache.load returns
  # one. Otherwise the document is parsed and its native backing is offered to
  # the cache; a failing cache write is ignored so it can never fail the parse.
  #
  # @param html [String] markup to parse
  # @param base_url [String, nil] forwarded to Document.new
  # @param build_indexes [Boolean] forwarded to Document.new
  # @return [Scrapetor::Document]
  def self.parse(html, base_url: nil, build_indexes: false)
    cacheable = PersistentCache.enabled? && html.is_a?(String) && !html.empty?

    if cacheable
      cached = PersistentCache.load(html)
      if cached
        return Document.new(html, base_url: base_url,
                                  build_indexes: build_indexes, native: cached)
      end
    end

    doc = Document.new(html, base_url: base_url, build_indexes: build_indexes)

    if cacheable
      begin
        PersistentCache.store(html, doc.backing.native)
      rescue StandardError
        # Best-effort: the cache is an optimisation, never a requirement.
        nil
      end
    end

    doc
  end

  # `Scrapetor::HTML(html)` — capital-H convenience method. Unlike the other
  # entry points, +base_url+ is an optional positional argument here.
  def self.HTML(html, base_url = nil)
    parse(html, base_url: base_url)
  end

  # Alias-style entry point; delegates straight to parse.
  def self.parse_html(html, base_url: nil)
    parse(html, base_url: base_url)
  end

  # Fragment entry point. Delegates to parse; no fragment-specific handling
  # is applied at this level.
  def self.parse_fragment(html, base_url: nil)
    parse(html, base_url: base_url)
  end

  # Parse from an arbitrary IO-like object (anything responding to `read`).
  def self.parse_io(io, base_url: nil)
    parse(io.read, base_url: base_url)
  end

  # Parse the contents of the file at +path+ (read via File.read).
  def self.parse_file(path, base_url: nil)
    parse(File.read(path), base_url: base_url)
  end

  # Parse N documents in parallel via native pthread workers, releasing
  # the GVL for the duration. Returns Array<Scrapetor::Document> in the
  # same order as the input. Skips the in-memory parse cache (which is
  # GVL-bound); use single-document Scrapetor.parse for cache-friendly
  # workloads.
  #
  # Use this for batch jobs over distinct documents where parsing
  # dominates: pre-warming a fixture corpus, indexing a crawl, A/B
  # comparing parsed shapes. Falls through to a serial Scrapetor.parse when
  # only one document is provided.
  #
  # @param htmls [Array<String>, String] documents to parse (coerced with Array())
  # @param threads [Integer, nil] worker count; defaults to
  #   default_parallel_threads(htmls.size)
  # @return [Array<Scrapetor::Document>]
  def self.parallel_parse(htmls, threads: nil)
    htmls = Array(htmls)
    return [] if htmls.empty?
    return [parse(htmls.first)] if htmls.size == 1

    n = threads || default_parallel_threads(htmls.size)
    natives = Native::Document.parallel_parse(htmls, n)
    natives.each_with_index.map do |native, i|
      Document.new(htmls[i], native: native)
    end
  end

  # Default worker count for parallel_parse: the smaller of the item count
  # and the processor count reported by Etc.nprocessors (4 if that lookup
  # raises a StandardError).
  def self.default_parallel_threads(n_items)
    cpu = begin
      require "etc" # loaded lazily; only the parallel path needs it
      Etc.nprocessors
    rescue StandardError
      4
    end
    [n_items, cpu].min
  end

  # Run an extraction schema directly against the file at +path+.
  def self.extract_file(path, schema, base_url: nil)
    extract(File.read(path), schema, base_url: base_url)
  end

  # `Scrapetor::HTML5(html)` — same parser, alternate name.
  #
  # Keywords are forwarded explicitly: with a bare `*args` splat, Ruby 3.0+
  # turns `base_url:` / `build_indexes:` into a trailing positional Hash and
  # `parse` rejects the call with ArgumentError.
  def self.HTML5(*args, **kwargs, &block)
    parse(*args, **kwargs, &block)
  end

  # `Scrapetor::HTML.parse` / `.fragment` namespace.
  module HTML
    # Same as Scrapetor.parse; positional and keyword arguments pass through.
    def self.parse(*args, **kwargs, &block)
      Scrapetor.parse(*args, **kwargs, &block)
    end

    # Same as Scrapetor.parse_fragment; arguments pass through unchanged.
    def self.fragment(*args, **kwargs, &block)
      Scrapetor.parse_fragment(*args, **kwargs, &block)
    end
  end

  # `Scrapetor::HTML5.parse` / `.fragment` namespace, mirroring HTML.
  module HTML5
    # Same as Scrapetor.parse; positional and keyword arguments pass through.
    def self.parse(*args, **kwargs, &block)
      Scrapetor.parse(*args, **kwargs, &block)
    end

    # Same as Scrapetor.parse_fragment; arguments pass through unchanged.
    def self.fragment(*args, **kwargs, &block)
      Scrapetor.parse_fragment(*args, **kwargs, &block)
    end
  end

  # ----- Extraction DSL -----

  # Build a schema from the given block via Schema.build.
  def self.schema(&block)
    Schema.build(&block)
  end

  # Parse +html+ and run an extraction on the resulting document. The schema
  # may be passed as an object, described by the block, or both; they are
  # handed to Document#extract as-is.
  def self.extract(html, schema = nil, base_url: nil, &block)
    parse(html, base_url: base_url).extract(schema, &block)
  end

  # Force the native streaming path. Raises Error if the native extension is
  # not loaded or the schema can't compile to a native descriptor.
  def self.extract_native(html, schema, base_url: nil)
    raise Error, "native extension not loaded" unless Native.available?

    desc = Native.compile_descriptor(schema)
    raise Error, "schema not native-compilable" unless desc

    Native.extract(html.to_s, desc, base_url)
  end

  # Force the Ruby reference path. Useful for parity tests + benchmarks.
  def self.extract_ruby(html, schema, base_url: nil)
    doc = parse(html, base_url: base_url)
    Extractor.run(doc, doc.backing, schema)
  end
end
data/scrapetor.gemspec ADDED
@@ -0,0 +1,77 @@
1
+ require_relative "lib/scrapetor/version"
2
+
3
# Gem specification for scrapetor. Evaluated by RubyGems/Bundler at build and
# install time; the version is read from Scrapetor::VERSION.
Gem::Specification.new do |spec|
  spec.name = "scrapetor"
  spec.version = Scrapetor::VERSION
  spec.authors = ["Alaa Abdulridha"]
  spec.email = ["alaa@serpapi.com"]

  spec.summary = "Production HTML parser + scraping toolkit. Native arena DOM, HTTP/2 fetch layer, streaming extraction."
  spec.description =
    "Scrapetor is a Ruby HTML parsing + scraping toolkit. The parser is a " \
    "native C arena DOM with structural indexes built at parse time and " \
    "NEON SIMD scanners in the SAX hot loop. A streaming extraction engine " \
    "compiles the schema DSL into a single forward pass — no DOM " \
    "materialised, one Ruby boundary crossing per document. " \
    "On builds where libcurl is available, Scrapetor::Fetcher adds an " \
    "HTTP/2-capable fetch layer with per-thread connection cache, shared " \
    "DNS + TLS session pool, in-process gzip / deflate / brotli / zstd " \
    "decoding, iconv charset transcoding, retry + exponential backoff, " \
    "ETag / Last-Modified disk cache with bulk revalidation, per-host " \
    "throttle, cookie jar, basic + bearer auth, proxy, and three bulk " \
    "concurrency models (parallel_fetch / multi_fetch / streaming " \
    "multi_each). Scrapetor::Session ties the cookie / auth / throttle / " \
    "retry policies together. Also ships robots.txt + sitemap.xml " \
    "parsers, a bounded-memory streaming HTML parser, and structured-data " \
    "extractors (JSON-LD, OpenGraph, Schema.org, Microdata, RDFa, Twitter " \
    "Cards). The Net::HTTP-based Scrapetor.fetch is preserved as the " \
    "no-libcurl fallback."

  spec.homepage = "https://scrapetor.org"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.7.0"
  spec.required_rubygems_version = ">= 3.0.0"

  # Site links are derived from spec.homepage so the scheme/host cannot drift
  # between the top-level homepage and the metadata entries (they previously
  # disagreed: https for homepage, plain http for homepage_uri / docs).
  spec.metadata = {
    "homepage_uri" => spec.homepage,
    "source_code_uri" => "https://github.com/Alaa-abdulridha/scrapetor",
    "bug_tracker_uri" => "https://github.com/Alaa-abdulridha/scrapetor/issues",
    "changelog_uri" => "https://github.com/Alaa-abdulridha/scrapetor/blob/main/CHANGELOG.md",
    "documentation_uri" => "#{spec.homepage}/docs",
    "wiki_uri" => "https://github.com/Alaa-abdulridha/scrapetor/wiki",
    "rubygems_mfa_required" => "true"
  }

  # Packaged files: Ruby sources, the C extension sources, executables and
  # the top-level docs. Globs are resolved relative to the current directory.
  spec.files = Dir[
    "lib/**/*.rb",
    "ext/**/*.{rb,c,h}",
    "ext/**/README.md",
    "bin/*",
    "CHANGELOG.md",
    "LICENSE",
    "README.md",
    "scrapetor.gemspec"
  ]

  spec.bindir = "bin"
  spec.executables = ["scrapetor", "scrapetor-bench"]
  spec.require_paths = ["lib"]
  spec.extensions = ["ext/scrapetor/native/extconf.rb"]

  # No runtime gem dependencies. Scrapetor is self-contained: pure Ruby
  # plus a single C99 extension. The extension compiles at install time
  # via the standard mkmf path; only a working C compiler is required.

  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "benchmark-ips", "~> 2.0"
  spec.add_development_dependency "rake", "~> 13.0"
  # webrick was bundled with Ruby 2.7 / earlier; removed from stdlib
  # in 3.0. The Fetcher + Session test suites spin up local HTTP
  # servers via it.
  spec.add_development_dependency "webrick", "~> 1.7"

  # Comparison oracles used by the benchmark scripts only. Not loaded by
  # production code.
  spec.add_development_dependency "nokogiri", ">= 1.13"
  spec.add_development_dependency "nokolexbor", ">= 0.6"
end
metadata ADDED
@@ -0,0 +1,200 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapetor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Alaa Abdulridha
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-05-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: minitest
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '5.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '5.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: benchmark-ips
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '13.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '13.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: webrick
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.7'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '1.13'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '1.13'
83
+ - !ruby/object:Gem::Dependency
84
+ name: nokolexbor
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0.6'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0.6'
97
+ description: Scrapetor is a Ruby HTML parsing + scraping toolkit. The parser is a
98
+ native C arena DOM with structural indexes built at parse time and NEON SIMD scanners
99
+ in the SAX hot loop. A streaming extraction engine compiles the schema DSL into
100
+ a single forward pass — no DOM materialised, one Ruby boundary crossing per document.
101
+ On builds where libcurl is available, Scrapetor::Fetcher adds an HTTP/2-capable
102
+ fetch layer with per-thread connection cache, shared DNS + TLS session pool, in-process
103
+ gzip / deflate / brotli / zstd decoding, iconv charset transcoding, retry + exponential
104
+ backoff, ETag / Last-Modified disk cache with bulk revalidation, per-host throttle,
105
+ cookie jar, basic + bearer auth, proxy, and three bulk concurrency models (parallel_fetch
106
+ / multi_fetch / streaming multi_each). Scrapetor::Session ties the cookie / auth
107
+ / throttle / retry policies together. Also ships robots.txt + sitemap.xml parsers,
108
+ a bounded-memory streaming HTML parser, and structured-data extractors (JSON-LD,
109
+ OpenGraph, Schema.org, Microdata, RDFa, Twitter Cards). The Net::HTTP-based Scrapetor.fetch
110
+ is preserved as the no-libcurl fallback.
111
+ email:
112
+ - alaa@serpapi.com
113
+ executables:
114
+ - scrapetor
115
+ - scrapetor-bench
116
+ extensions:
117
+ - ext/scrapetor/native/extconf.rb
118
+ extra_rdoc_files: []
119
+ files:
120
+ - CHANGELOG.md
121
+ - LICENSE
122
+ - README.md
123
+ - bin/scrapetor
124
+ - bin/scrapetor-bench
125
+ - ext/scrapetor/README.md
126
+ - ext/scrapetor/native/extconf.rb
127
+ - ext/scrapetor/native/scrapetor_dom.c
128
+ - ext/scrapetor/native/scrapetor_http.c
129
+ - ext/scrapetor/native/scrapetor_native.c
130
+ - lib/scrapetor.rb
131
+ - lib/scrapetor/builder.rb
132
+ - lib/scrapetor/cleaner.rb
133
+ - lib/scrapetor/comment_node.rb
134
+ - lib/scrapetor/document.rb
135
+ - lib/scrapetor/dom.rb
136
+ - lib/scrapetor/dom/parser.rb
137
+ - lib/scrapetor/dom/selectors.rb
138
+ - lib/scrapetor/encoding.rb
139
+ - lib/scrapetor/entities.rb
140
+ - lib/scrapetor/errors.rb
141
+ - lib/scrapetor/extractor.rb
142
+ - lib/scrapetor/fetcher.rb
143
+ - lib/scrapetor/fingerprint.rb
144
+ - lib/scrapetor/form.rb
145
+ - lib/scrapetor/http.rb
146
+ - lib/scrapetor/microdata.rb
147
+ - lib/scrapetor/money.rb
148
+ - lib/scrapetor/native.rb
149
+ - lib/scrapetor/native_dom.rb
150
+ - lib/scrapetor/node.rb
151
+ - lib/scrapetor/node_set.rb
152
+ - lib/scrapetor/page_type.rb
153
+ - lib/scrapetor/pagination.rb
154
+ - lib/scrapetor/persistent_cache.rb
155
+ - lib/scrapetor/robots.rb
156
+ - lib/scrapetor/sax.rb
157
+ - lib/scrapetor/schema.rb
158
+ - lib/scrapetor/selector.rb
159
+ - lib/scrapetor/session.rb
160
+ - lib/scrapetor/sitemap.rb
161
+ - lib/scrapetor/stream.rb
162
+ - lib/scrapetor/structured_data.rb
163
+ - lib/scrapetor/template_registry.rb
164
+ - lib/scrapetor/text_node.rb
165
+ - lib/scrapetor/url.rb
166
+ - lib/scrapetor/version.rb
167
+ - lib/scrapetor/xpath.rb
168
+ - scrapetor.gemspec
169
+ homepage: https://scrapetor.org
170
+ licenses:
171
+ - MIT
172
+ metadata:
173
+ homepage_uri: http://scrapetor.org
174
+ source_code_uri: https://github.com/Alaa-abdulridha/scrapetor
175
+ bug_tracker_uri: https://github.com/Alaa-abdulridha/scrapetor/issues
176
+ changelog_uri: https://github.com/Alaa-abdulridha/scrapetor/blob/main/CHANGELOG.md
177
+ documentation_uri: http://scrapetor.org/docs
178
+ wiki_uri: https://github.com/Alaa-abdulridha/scrapetor/wiki
179
+ rubygems_mfa_required: 'true'
180
+ post_install_message:
181
+ rdoc_options: []
182
+ require_paths:
183
+ - lib
184
+ required_ruby_version: !ruby/object:Gem::Requirement
185
+ requirements:
186
+ - - ">="
187
+ - !ruby/object:Gem::Version
188
+ version: 2.7.0
189
+ required_rubygems_version: !ruby/object:Gem::Requirement
190
+ requirements:
191
+ - - ">="
192
+ - !ruby/object:Gem::Version
193
+ version: 3.0.0
194
+ requirements: []
195
+ rubygems_version: 3.1.6
196
+ signing_key:
197
+ specification_version: 4
198
+ summary: Production HTML parser + scraping toolkit. Native arena DOM, HTTP/2 fetch
199
+ layer, streaming extraction.
200
+ test_files: []