vore 0.4.0-arm64-darwin → 0.5.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 214aba124a131e6567e5d6eeba171d3efb63d9b74f73efe19e2e63730da5f510
4
- data.tar.gz: 971b1d9b5e5adc36ba7de5b3e6451443b4b3124ef3e73b63f44c3dc96d9effa1
3
+ metadata.gz: 1f7c02f426ad954cfa86f93607fd56fddf1587d010904d36ea627dc8ca11961e
4
+ data.tar.gz: 72ab10f3463093e373c55ff1f0cbbd4016bbd4cab9bb488040f209359e8eefde
5
5
  SHA512:
6
- metadata.gz: c123316da7dfeba1c4f3b5f8c4a62411493770a3d726097ddefa23e06868a4ec7a38d853b8b973ec75ee7773383f302b671d83057a5ac0a343f801a4c90a3420
7
- data.tar.gz: 6de3f0622a91ad15f63331075ed3b79746f4d894f9521c25dd8815779212d93492c10014e47cb36aa0c6bdaa1ae8c2feb5d6bafdc19cdc792f17eccc97db0e74
6
+ metadata.gz: 5d1916963039e4faf9dab3af37289e2309a4f775a5dde0afd888fa1e4ca9aa225f044335949fdc10fe1b91e3504a3c966f4a2277706af6a0425b77c8e25792a2
7
+ data.tar.gz: 036cfc1bdf2b24052333edea4744cfd01ba5550f5dd302bb57fe0f0e6686d48cfc15a5f9532ad0c06c377d84552ab1ceb53ebaa4ca085985b92b2f4a4eaea55e
data/Cargo.lock CHANGED
@@ -238,7 +238,7 @@ version = "0.2.3"
238
238
  source = "registry+https://github.com/rust-lang/crates.io-index"
239
239
  checksum = "fc229be27b394115abdc89e09500d5030407734d21a143a833eae5f136821bcd"
240
240
  dependencies = [
241
- "compact_str",
241
+ "compact_str 0.7.1",
242
242
  "serde",
243
243
  ]
244
244
 
@@ -346,6 +346,20 @@ dependencies = [
346
346
  "static_assertions",
347
347
  ]
348
348
 
349
+ [[package]]
350
+ name = "compact_str"
351
+ version = "0.8.0"
352
+ source = "registry+https://github.com/rust-lang/crates.io-index"
353
+ checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644"
354
+ dependencies = [
355
+ "castaway",
356
+ "cfg-if",
357
+ "itoa",
358
+ "rustversion",
359
+ "ryu",
360
+ "static_assertions",
361
+ ]
362
+
349
363
  [[package]]
350
364
  name = "cookie"
351
365
  version = "0.18.1"
@@ -1418,18 +1432,18 @@ dependencies = [
1418
1432
 
1419
1433
  [[package]]
1420
1434
  name = "rb-sys"
1421
- version = "0.9.98"
1435
+ version = "0.9.99"
1422
1436
  source = "registry+https://github.com/rust-lang/crates.io-index"
1423
- checksum = "8914b2e6af10bd50dd7aaac8c5146872d3924d6012929b4ff504e988f6badd24"
1437
+ checksum = "d83151cfea2b67db2444f68c53b119ff77cff235ad711c765072e4daf8f3185b"
1424
1438
  dependencies = [
1425
1439
  "rb-sys-build",
1426
1440
  ]
1427
1441
 
1428
1442
  [[package]]
1429
1443
  name = "rb-sys-build"
1430
- version = "0.9.98"
1444
+ version = "0.9.99"
1431
1445
  source = "registry+https://github.com/rust-lang/crates.io-index"
1432
- checksum = "12af68c9757d419b82d65a12b5db538990dfe9416049fea3f0ba4b9a8ca108cd"
1446
+ checksum = "32d038214c118ad4a75db555ccb78672e17e1c5c10f344456cd129008dbaa7de"
1433
1447
  dependencies = [
1434
1448
  "bindgen",
1435
1449
  "lazy_static",
@@ -1786,14 +1800,14 @@ dependencies = [
1786
1800
 
1787
1801
  [[package]]
1788
1802
  name = "spider"
1789
- version = "1.99.8"
1803
+ version = "1.99.11"
1790
1804
  source = "registry+https://github.com/rust-lang/crates.io-index"
1791
- checksum = "525670cdc6aec8f4cb91da17ce0255050e89eb7c889272216d8a4fb644d67530"
1805
+ checksum = "e23ad22d5e55b09f480f849b37dd2fe315e3cf1df0f5261209aa5482483c617f"
1792
1806
  dependencies = [
1793
1807
  "ahash",
1794
1808
  "bytes",
1795
1809
  "case_insensitive_string",
1796
- "compact_str",
1810
+ "compact_str 0.8.0",
1797
1811
  "cssparser",
1798
1812
  "ego-tree",
1799
1813
  "fast_html5ever",
@@ -1817,9 +1831,9 @@ dependencies = [
1817
1831
 
1818
1832
  [[package]]
1819
1833
  name = "spider_cli"
1820
- version = "1.99.8"
1834
+ version = "1.99.11"
1821
1835
  source = "registry+https://github.com/rust-lang/crates.io-index"
1822
- checksum = "9bd9d95178dc0715608d5f28501c8321de3e14b40046c6584a12ffce96f0a676"
1836
+ checksum = "be5da7d570871156c08025bdc13de670807d36a90be94f8aa8342a04e5268662"
1823
1837
  dependencies = [
1824
1838
  "clap",
1825
1839
  "env_logger",
data/README.md CHANGED
@@ -42,6 +42,29 @@ The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), s
42
42
  | `delete_after_yield` | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
43
43
  | `log_level` | The logging level. | `:warn` |
44
44
 
45
+ ### Processing pages
46
+
47
+ Vore processes HTML using handlers. By default, there are two:
48
+
49
+ * The `MetaExtractor`, which extracts information from your `title` and `meta` tags
50
+ * The `TagRemover`, which removes unnecessary elements like `header`, `footer`, and `script`
51
+
52
+ If you wish to process the HTML further, you can provide your own handler:
53
+
54
+
55
+ ```ruby
56
+ Vore::Crawler.new(handlers: [MySpecialHandler.new])
57
+ ```
58
+
59
+ Handlers are defined using [Selma](https://github.com/gjtorikian/selma?tab=readme-ov-file#defining-handlers). Note that the `MetaExtractor` is always included and defined first, but if you pass anything in the `handlers` array, it will replace Vore's other default handlers. You can of course choose to include them manually:
60
+
61
+
62
+ ```ruby
63
+ # preserve Vore's default content handler while adding your own;
64
+ # `MetaExtractor` is prefixed to the front
65
+ Vore::Crawler.new(handlers: [Vore::Handlers::TagRemover.new, MySpecialHandler.new])
66
+ ```
67
+
45
68
  ### In tests
46
69
 
47
70
  Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
data/exe/vore-spider CHANGED
Binary file
@@ -3,6 +3,7 @@
3
3
  module Vore
4
4
  class Configuration
5
5
  DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.dup.merge({
6
+ allow_comments: false,
6
7
  allow_doctype: false,
7
8
  })
8
9
 
data/lib/vore/crawler.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "handlers/content_extractor"
3
+ require_relative "handlers/meta_extractor"
4
+ require_relative "handlers/tag_remover"
4
5
 
5
6
  require "listen"
6
7
 
@@ -10,13 +11,20 @@ module Vore
10
11
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
11
12
  FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
12
13
 
13
- attr_reader :output_dir
14
+ attr_reader :handlers, :output_dir
14
15
 
15
16
  # Creates a crawler
16
17
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
17
- def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, options: {})
18
- @content_extractor = Vore::Handlers::ContentExtractor.new
19
- @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
18
+ def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
19
+ @meta_extractor = Vore::Handlers::MetaExtractor.new
20
+
21
+ @handlers = if handlers.nil?
22
+ [@meta_extractor, Vore::Handlers::TagRemover.new]
23
+ else
24
+ handlers.unshift(@meta_extractor)
25
+ end
26
+
27
+ @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)
20
28
  ext = PLATFORM.include?("windows") ? ".exe" : ""
21
29
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
22
30
  @options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
@@ -70,34 +78,36 @@ module Vore
70
78
  @results[:pages_visited] += 1
71
79
 
72
80
  html_file = File.read(path).force_encoding("UTF-8")
73
- rewritten_html_file = ""
74
81
 
75
82
  if html_file.empty?
76
83
  @results[:unprocessed_pages] << path
77
84
  return
78
85
  end
79
86
 
80
- begin
81
- rewritten_html_file = @selma.rewrite(html_file)
82
- rescue StandardError => e
83
- Vore.logger.warn("Error rewriting #{path}: #{e}")
84
- @results[:unprocessed_pages] << path
85
- return
86
- end
87
+ rewritten_html_file = @selma.rewrite(html_file)
88
+ return if rewritten_html_file.empty?
87
89
 
88
90
  # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
89
91
  url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
90
92
 
91
93
  page = Vore::PageData.new(
92
94
  content: rewritten_html_file,
93
- title: @content_extractor.title,
94
- meta: @content_extractor.meta,
95
+ title: @meta_extractor.title,
96
+ meta: @meta_extractor.meta,
95
97
  path: url_path,
96
98
  )
97
99
 
98
100
  yield page
99
101
  end
100
102
 
103
+ def rewrite(html_file)
104
+ @selma.rewrite(html_file)
105
+ rescue StandardError => e
106
+ Vore.logger.warn("Error rewriting #{path}: #{e}")
107
+ @results[:unprocessed_pages] << path
108
+ ""
109
+ end
110
+
101
111
  def run_command(website, delay: 0)
102
112
  pid = Process.spawn(
103
113
  @executable,
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Vore
4
4
  module Handlers
5
- class ContentExtractor
5
+ class MetaExtractor
6
6
  SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
7
7
 
8
8
  attr_reader :title, :meta
@@ -19,22 +19,14 @@ module Vore
19
19
  end
20
20
 
21
21
  def handle_element(element)
22
- if element.tag_name == "pre" ||
23
- element.tag_name == "form" ||
24
- element.tag_name == "style" ||
25
- element.tag_name == "noscript" ||
26
- element.tag_name == "script" ||
27
- element.tag_name == "svg"
28
- element.remove
29
- elsif element.tag_name == "title"
22
+ if element.tag_name == "title"
30
23
  @within_title = true
24
+
31
25
  element.remove
32
26
  elsif element.tag_name == "meta"
33
27
  return if element.attributes["name"].nil?
34
28
 
35
29
  @meta[element.attributes["name"]] = element.attributes["content"]
36
- else
37
- element.remove_and_keep_content
38
30
  end
39
31
  end
40
32
 
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Vore
4
+ module Handlers
5
+ class TagRemover
6
+ SELECTOR = Selma::Selector.new(match_element: "*")
7
+
8
+ def selector
9
+ SELECTOR
10
+ end
11
+
12
+ UNNECESSARY_TAGS = [
13
+ # Remove code elements
14
+ "pre",
15
+
16
+ # Remove unnecessary elements
17
+ "head",
18
+
19
+ "form",
20
+ "style",
21
+ "noscript",
22
+ "script",
23
+ "svg",
24
+
25
+ # Remove unnecessary nav elements
26
+ "header",
27
+ "footer",
28
+ "nav",
29
+ "aside",
30
+ ]
31
+
32
+ CONTENT_TO_KEEP = [
33
+ "html",
34
+ "body",
35
+ ]
36
+
37
+ def handle_element(element)
38
+ if UNNECESSARY_TAGS.include?(element.tag_name)
39
+ element.remove
40
+ elsif CONTENT_TO_KEEP.include?(element.tag_name)
41
+ element.remove_and_keep_content
42
+ end
43
+ end
44
+ end
45
+ end
46
+ end
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.4.0"
4
+ VERSION = "0.5.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-22 00:00:00.000000000 Z
11
+ date: 2024-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: listen
@@ -60,7 +60,8 @@ files:
60
60
  - lib/vore.rb
61
61
  - lib/vore/configuration.rb
62
62
  - lib/vore/crawler.rb
63
- - lib/vore/handlers/content_extractor.rb
63
+ - lib/vore/handlers/meta_extractor.rb
64
+ - lib/vore/handlers/tag_remover.rb
64
65
  - lib/vore/logger.rb
65
66
  - lib/vore/minitest_helper.rb
66
67
  - lib/vore/page.rb