vore 0.4.0-x86_64-darwin → 0.5.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7231d77600a8372eda42123a340eac3eabdd707a95f018fcc112ae77934ad50d
4
- data.tar.gz: b205e579cd2dd41dc8e816ac64f7fc40a829c4bd3ca455db670e72104ef94831
3
+ metadata.gz: 375d0bfc0c775c2be95de44e7d5a750da641f00332490deace0863b569331cb6
4
+ data.tar.gz: 9e39c8fbb2b9813e86138c76364e5de9539390d8569e76f35bdb7dcd25a18cae
5
5
  SHA512:
6
- metadata.gz: bae9647133f39ea1ab8413991434ea2433e98433a2dcb386793edbbc2995c606338a4e12a7b83cb7b69f390d25a7bbdb8e9228403b3742b7b9d1a374e6d5fa61
7
- data.tar.gz: 24461a245a4498dd4c1a2ef3d374510604186b64a57616d9a6d95aa9dd16a7ed95bb98cd336b5ec82c78ae90cbb44449c6731f6f905ffeb864b25a7b1075d4a3
6
+ metadata.gz: d8ed087d41652d299c514290f16dcf88e65f38d2c719dc3dc914b8ad1f9cd6674aca09053310b1cd08aeed256428fd98cabe2e9cdc5c93e0fa27838393c04759
7
+ data.tar.gz: 94cc7f34e9c8cccbf85e24b1ad38dd17e91fb0f736838422c61a09532aefa323973cc410c3f72d52939b6e75d909f703ab8c93e2b4da3195721102cedd4760e1
data/Cargo.lock CHANGED
@@ -238,7 +238,7 @@ version = "0.2.3"
238
238
  source = "registry+https://github.com/rust-lang/crates.io-index"
239
239
  checksum = "fc229be27b394115abdc89e09500d5030407734d21a143a833eae5f136821bcd"
240
240
  dependencies = [
241
- "compact_str",
241
+ "compact_str 0.7.1",
242
242
  "serde",
243
243
  ]
244
244
 
@@ -346,6 +346,20 @@ dependencies = [
346
346
  "static_assertions",
347
347
  ]
348
348
 
349
+ [[package]]
350
+ name = "compact_str"
351
+ version = "0.8.0"
352
+ source = "registry+https://github.com/rust-lang/crates.io-index"
353
+ checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644"
354
+ dependencies = [
355
+ "castaway",
356
+ "cfg-if",
357
+ "itoa",
358
+ "rustversion",
359
+ "ryu",
360
+ "static_assertions",
361
+ ]
362
+
349
363
  [[package]]
350
364
  name = "cookie"
351
365
  version = "0.18.1"
@@ -1418,18 +1432,18 @@ dependencies = [
1418
1432
 
1419
1433
  [[package]]
1420
1434
  name = "rb-sys"
1421
- version = "0.9.98"
1435
+ version = "0.9.99"
1422
1436
  source = "registry+https://github.com/rust-lang/crates.io-index"
1423
- checksum = "8914b2e6af10bd50dd7aaac8c5146872d3924d6012929b4ff504e988f6badd24"
1437
+ checksum = "d83151cfea2b67db2444f68c53b119ff77cff235ad711c765072e4daf8f3185b"
1424
1438
  dependencies = [
1425
1439
  "rb-sys-build",
1426
1440
  ]
1427
1441
 
1428
1442
  [[package]]
1429
1443
  name = "rb-sys-build"
1430
- version = "0.9.98"
1444
+ version = "0.9.99"
1431
1445
  source = "registry+https://github.com/rust-lang/crates.io-index"
1432
- checksum = "12af68c9757d419b82d65a12b5db538990dfe9416049fea3f0ba4b9a8ca108cd"
1446
+ checksum = "32d038214c118ad4a75db555ccb78672e17e1c5c10f344456cd129008dbaa7de"
1433
1447
  dependencies = [
1434
1448
  "bindgen",
1435
1449
  "lazy_static",
@@ -1786,14 +1800,14 @@ dependencies = [
1786
1800
 
1787
1801
  [[package]]
1788
1802
  name = "spider"
1789
- version = "1.99.8"
1803
+ version = "1.99.11"
1790
1804
  source = "registry+https://github.com/rust-lang/crates.io-index"
1791
- checksum = "525670cdc6aec8f4cb91da17ce0255050e89eb7c889272216d8a4fb644d67530"
1805
+ checksum = "e23ad22d5e55b09f480f849b37dd2fe315e3cf1df0f5261209aa5482483c617f"
1792
1806
  dependencies = [
1793
1807
  "ahash",
1794
1808
  "bytes",
1795
1809
  "case_insensitive_string",
1796
- "compact_str",
1810
+ "compact_str 0.8.0",
1797
1811
  "cssparser",
1798
1812
  "ego-tree",
1799
1813
  "fast_html5ever",
@@ -1817,9 +1831,9 @@ dependencies = [
1817
1831
 
1818
1832
  [[package]]
1819
1833
  name = "spider_cli"
1820
- version = "1.99.8"
1834
+ version = "1.99.11"
1821
1835
  source = "registry+https://github.com/rust-lang/crates.io-index"
1822
- checksum = "9bd9d95178dc0715608d5f28501c8321de3e14b40046c6584a12ffce96f0a676"
1836
+ checksum = "be5da7d570871156c08025bdc13de670807d36a90be94f8aa8342a04e5268662"
1823
1837
  dependencies = [
1824
1838
  "clap",
1825
1839
  "env_logger",
data/README.md CHANGED
@@ -42,6 +42,29 @@ The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), s
42
42
  | `delete_after_yield` | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
43
43
  | `log_level` | The logging level. | `:warn` |
44
44
 
45
+ ### Processing pages
46
+
47
+ Vore processes HTML using handlers. By default, there are two:
48
+
49
+ * The `MetaExtractor`, which extracts information from your `title` and `meta` tags
50
+ * The `TagRemover`, which removes unnecessary elements such as `header`, `footer`, and `script`
51
+
52
+ If you wish to process the HTML further, you can provide your own handler:
53
+
54
+
55
+ ```ruby
56
+ Vore::Crawler.new(handlers: [MySpecialHandler.new])
57
+ ```
58
+
59
+ Handlers are defined using [Selma](https://github.com/gjtorikian/selma?tab=readme-ov-file#defining-handlers). Note that the `MetaExtractor` is always included and placed first, but if you pass anything to the `handlers` array, it replaces Vore's other default handlers. You can of course choose to include them manually:
60
+
61
+
62
+ ```ruby
63
+ # preserve Vore's default content handler while adding your own;
64
+ # `MetaExtractor` is prefixed to the front
65
+ Vore::Crawler.new(handlers: [Vore::Handlers::TagRemover.new, MySpecialHandler.new])
66
+ ```
67
+
45
68
  ### In tests
46
69
 
47
70
  Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
data/exe/vore-spider CHANGED
Binary file
@@ -3,6 +3,7 @@
3
3
  module Vore
4
4
  class Configuration
5
5
  DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.dup.merge({
6
+ allow_comments: false,
6
7
  allow_doctype: false,
7
8
  })
8
9
 
data/lib/vore/crawler.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "handlers/content_extractor"
3
+ require_relative "handlers/meta_extractor"
4
+ require_relative "handlers/tag_remover"
4
5
 
5
6
  require "listen"
6
7
 
@@ -10,13 +11,20 @@ module Vore
10
11
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
11
12
  FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
12
13
 
13
- attr_reader :output_dir
14
+ attr_reader :handlers, :output_dir
14
15
 
15
16
  # Creates a crawler
16
17
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
17
- def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, options: {})
18
- @content_extractor = Vore::Handlers::ContentExtractor.new
19
- @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
18
+ def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
19
+ @meta_extractor = Vore::Handlers::MetaExtractor.new
20
+
21
+ @handlers = if handlers.nil?
22
+ [@meta_extractor, Vore::Handlers::TagRemover.new]
23
+ else
24
+ handlers.unshift(@meta_extractor)
25
+ end
26
+
27
+ @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)
20
28
  ext = PLATFORM.include?("windows") ? ".exe" : ""
21
29
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
22
30
  @options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
@@ -70,34 +78,36 @@ module Vore
70
78
  @results[:pages_visited] += 1
71
79
 
72
80
  html_file = File.read(path).force_encoding("UTF-8")
73
- rewritten_html_file = ""
74
81
 
75
82
  if html_file.empty?
76
83
  @results[:unprocessed_pages] << path
77
84
  return
78
85
  end
79
86
 
80
- begin
81
- rewritten_html_file = @selma.rewrite(html_file)
82
- rescue StandardError => e
83
- Vore.logger.warn("Error rewriting #{path}: #{e}")
84
- @results[:unprocessed_pages] << path
85
- return
86
- end
87
+ rewritten_html_file = @selma.rewrite(html_file)
88
+ return if rewritten_html_file.empty?
87
89
 
88
90
  # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
89
91
  url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
90
92
 
91
93
  page = Vore::PageData.new(
92
94
  content: rewritten_html_file,
93
- title: @content_extractor.title,
94
- meta: @content_extractor.meta,
95
+ title: @meta_extractor.title,
96
+ meta: @meta_extractor.meta,
95
97
  path: url_path,
96
98
  )
97
99
 
98
100
  yield page
99
101
  end
100
102
 
103
+ def rewrite(html_file)
104
+ @selma.rewrite(html_file)
105
+ rescue StandardError => e
106
+ Vore.logger.warn("Error rewriting #{path}: #{e}")
107
+ @results[:unprocessed_pages] << path
108
+ ""
109
+ end
110
+
101
111
  def run_command(website, delay: 0)
102
112
  pid = Process.spawn(
103
113
  @executable,
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Vore
4
4
  module Handlers
5
- class ContentExtractor
5
+ class MetaExtractor
6
6
  SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
7
7
 
8
8
  attr_reader :title, :meta
@@ -19,22 +19,14 @@ module Vore
19
19
  end
20
20
 
21
21
  def handle_element(element)
22
- if element.tag_name == "pre" ||
23
- element.tag_name == "form" ||
24
- element.tag_name == "style" ||
25
- element.tag_name == "noscript" ||
26
- element.tag_name == "script" ||
27
- element.tag_name == "svg"
28
- element.remove
29
- elsif element.tag_name == "title"
22
+ if element.tag_name == "title"
30
23
  @within_title = true
24
+
31
25
  element.remove
32
26
  elsif element.tag_name == "meta"
33
27
  return if element.attributes["name"].nil?
34
28
 
35
29
  @meta[element.attributes["name"]] = element.attributes["content"]
36
- else
37
- element.remove_and_keep_content
38
30
  end
39
31
  end
40
32
 
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Vore
4
+ module Handlers
5
+ class TagRemover
6
+ SELECTOR = Selma::Selector.new(match_element: "*")
7
+
8
+ def selector
9
+ SELECTOR
10
+ end
11
+
12
+ UNNECESSARY_TAGS = [
13
+ # Remove code elements
14
+ "pre",
15
+
16
+ # Remove unnecessary elements
17
+ "head",
18
+
19
+ "form",
20
+ "style",
21
+ "noscript",
22
+ "script",
23
+ "svg",
24
+
25
+ # Remove unnecessary nav elements
26
+ "header",
27
+ "footer",
28
+ "nav",
29
+ "aside",
30
+ ]
31
+
32
+ CONTENT_TO_KEEP = [
33
+ "html",
34
+ "body",
35
+ ]
36
+
37
+ def handle_element(element)
38
+ if UNNECESSARY_TAGS.include?(element.tag_name)
39
+ element.remove
40
+ elsif CONTENT_TO_KEEP.include?(element.tag_name)
41
+ element.remove_and_keep_content
42
+ end
43
+ end
44
+ end
45
+ end
46
+ end
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.4.0"
4
+ VERSION = "0.5.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-22 00:00:00.000000000 Z
11
+ date: 2024-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: listen
@@ -60,7 +60,8 @@ files:
60
60
  - lib/vore.rb
61
61
  - lib/vore/configuration.rb
62
62
  - lib/vore/crawler.rb
63
- - lib/vore/handlers/content_extractor.rb
63
+ - lib/vore/handlers/meta_extractor.rb
64
+ - lib/vore/handlers/tag_remover.rb
64
65
  - lib/vore/logger.rb
65
66
  - lib/vore/minitest_helper.rb
66
67
  - lib/vore/page.rb