vore 0.4.0-arm64-darwin → 0.5.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +24 -10
- data/README.md +23 -0
- data/exe/vore-spider +0 -0
- data/lib/vore/configuration.rb +1 -0
- data/lib/vore/crawler.rb +25 -15
- data/lib/vore/handlers/{content_extractor.rb → meta_extractor.rb} +3 -11
- data/lib/vore/handlers/tag_remover.rb +46 -0
- data/lib/vore/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f7c02f426ad954cfa86f93607fd56fddf1587d010904d36ea627dc8ca11961e
|
4
|
+
data.tar.gz: 72ab10f3463093e373c55ff1f0cbbd4016bbd4cab9bb488040f209359e8eefde
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d1916963039e4faf9dab3af37289e2309a4f775a5dde0afd888fa1e4ca9aa225f044335949fdc10fe1b91e3504a3c966f4a2277706af6a0425b77c8e25792a2
|
7
|
+
data.tar.gz: 036cfc1bdf2b24052333edea4744cfd01ba5550f5dd302bb57fe0f0e6686d48cfc15a5f9532ad0c06c377d84552ab1ceb53ebaa4ca085985b92b2f4a4eaea55e
|
data/Cargo.lock
CHANGED
@@ -238,7 +238,7 @@ version = "0.2.3"
|
|
238
238
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
239
239
|
checksum = "fc229be27b394115abdc89e09500d5030407734d21a143a833eae5f136821bcd"
|
240
240
|
dependencies = [
|
241
|
-
"compact_str",
|
241
|
+
"compact_str 0.7.1",
|
242
242
|
"serde",
|
243
243
|
]
|
244
244
|
|
@@ -346,6 +346,20 @@ dependencies = [
|
|
346
346
|
"static_assertions",
|
347
347
|
]
|
348
348
|
|
349
|
+
[[package]]
|
350
|
+
name = "compact_str"
|
351
|
+
version = "0.8.0"
|
352
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
353
|
+
checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644"
|
354
|
+
dependencies = [
|
355
|
+
"castaway",
|
356
|
+
"cfg-if",
|
357
|
+
"itoa",
|
358
|
+
"rustversion",
|
359
|
+
"ryu",
|
360
|
+
"static_assertions",
|
361
|
+
]
|
362
|
+
|
349
363
|
[[package]]
|
350
364
|
name = "cookie"
|
351
365
|
version = "0.18.1"
|
@@ -1418,18 +1432,18 @@ dependencies = [
|
|
1418
1432
|
|
1419
1433
|
[[package]]
|
1420
1434
|
name = "rb-sys"
|
1421
|
-
version = "0.9.
|
1435
|
+
version = "0.9.99"
|
1422
1436
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1423
|
-
checksum = "
|
1437
|
+
checksum = "d83151cfea2b67db2444f68c53b119ff77cff235ad711c765072e4daf8f3185b"
|
1424
1438
|
dependencies = [
|
1425
1439
|
"rb-sys-build",
|
1426
1440
|
]
|
1427
1441
|
|
1428
1442
|
[[package]]
|
1429
1443
|
name = "rb-sys-build"
|
1430
|
-
version = "0.9.
|
1444
|
+
version = "0.9.99"
|
1431
1445
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1432
|
-
checksum = "
|
1446
|
+
checksum = "32d038214c118ad4a75db555ccb78672e17e1c5c10f344456cd129008dbaa7de"
|
1433
1447
|
dependencies = [
|
1434
1448
|
"bindgen",
|
1435
1449
|
"lazy_static",
|
@@ -1786,14 +1800,14 @@ dependencies = [
|
|
1786
1800
|
|
1787
1801
|
[[package]]
|
1788
1802
|
name = "spider"
|
1789
|
-
version = "1.99.
|
1803
|
+
version = "1.99.11"
|
1790
1804
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1791
|
-
checksum = "
|
1805
|
+
checksum = "e23ad22d5e55b09f480f849b37dd2fe315e3cf1df0f5261209aa5482483c617f"
|
1792
1806
|
dependencies = [
|
1793
1807
|
"ahash",
|
1794
1808
|
"bytes",
|
1795
1809
|
"case_insensitive_string",
|
1796
|
-
"compact_str",
|
1810
|
+
"compact_str 0.8.0",
|
1797
1811
|
"cssparser",
|
1798
1812
|
"ego-tree",
|
1799
1813
|
"fast_html5ever",
|
@@ -1817,9 +1831,9 @@ dependencies = [
|
|
1817
1831
|
|
1818
1832
|
[[package]]
|
1819
1833
|
name = "spider_cli"
|
1820
|
-
version = "1.99.
|
1834
|
+
version = "1.99.11"
|
1821
1835
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1822
|
-
checksum = "
|
1836
|
+
checksum = "be5da7d570871156c08025bdc13de670807d36a90be94f8aa8342a04e5268662"
|
1823
1837
|
dependencies = [
|
1824
1838
|
"clap",
|
1825
1839
|
"env_logger",
|
data/README.md
CHANGED
@@ -42,6 +42,29 @@ The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), s
|
|
42
42
|
| `delete_after_yield` | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
|
43
43
|
| `log_level` | The logging level. | `:warn` |
|
44
44
|
|
45
|
+
### Processing pages
|
46
|
+
|
47
|
+
Vore processes HTML using handlers. By default, there are two:
|
48
|
+
|
49
|
+
* The `MetaExtractor`, which extracts information from your `title` and `meta` tags
|
50
|
+
* The `TagRemover`, which removes unnecessary elements such as `header`, `footer`, and `script`
|
51
|
+
|
52
|
+
If you wish to process the HTML further, you can provide your own handler:
|
53
|
+
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
Vore::Crawler.new(handlers: [MySpecialHandler.new])
|
57
|
+
```
|
58
|
+
|
59
|
+
Handlers are defined using [Selma](https://github.com/gjtorikian/selma?tab=readme-ov-file#defining-handlers). Note that the `MetaExtractor` is always included and placed first, but if you pass anything in the `handlers` array, it replaces Vore's other default handlers. You can of course choose to include them manually:
|
60
|
+
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
# preserve Vore's default content handler while adding your own;
|
64
|
+
# `MetaExtractor` is prefixed to the front
|
65
|
+
Vore::Crawler.new(handlers: [Vore::Handlers::TagRemover.new, MySpecialHandler.new])
|
66
|
+
```
|
67
|
+
|
45
68
|
### In tests
|
46
69
|
|
47
70
|
Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/configuration.rb
CHANGED
data/lib/vore/crawler.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative "handlers/
|
3
|
+
require_relative "handlers/meta_extractor"
|
4
|
+
require_relative "handlers/tag_remover"
|
4
5
|
|
5
6
|
require "listen"
|
6
7
|
|
@@ -10,13 +11,20 @@ module Vore
|
|
10
11
|
PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
|
11
12
|
FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
|
12
13
|
|
13
|
-
attr_reader :output_dir
|
14
|
+
attr_reader :handlers, :output_dir
|
14
15
|
|
15
16
|
# Creates a crawler
|
16
17
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
17
|
-
def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, options: {})
|
18
|
-
@
|
19
|
-
|
18
|
+
def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
|
19
|
+
@meta_extractor = Vore::Handlers::MetaExtractor.new
|
20
|
+
|
21
|
+
@handlers = if handlers.nil?
|
22
|
+
[@meta_extractor, Vore::Handlers::TagRemover.new]
|
23
|
+
else
|
24
|
+
handlers.unshift(@meta_extractor)
|
25
|
+
end
|
26
|
+
|
27
|
+
@selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)
|
20
28
|
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
21
29
|
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
22
30
|
@options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
|
@@ -70,34 +78,36 @@ module Vore
|
|
70
78
|
@results[:pages_visited] += 1
|
71
79
|
|
72
80
|
html_file = File.read(path).force_encoding("UTF-8")
|
73
|
-
rewritten_html_file = ""
|
74
81
|
|
75
82
|
if html_file.empty?
|
76
83
|
@results[:unprocessed_pages] << path
|
77
84
|
return
|
78
85
|
end
|
79
86
|
|
80
|
-
|
81
|
-
|
82
|
-
rescue StandardError => e
|
83
|
-
Vore.logger.warn("Error rewriting #{path}: #{e}")
|
84
|
-
@results[:unprocessed_pages] << path
|
85
|
-
return
|
86
|
-
end
|
87
|
+
rewritten_html_file = @selma.rewrite(html_file)
|
88
|
+
return if rewritten_html_file.empty?
|
87
89
|
|
88
90
|
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
89
91
|
url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
|
90
92
|
|
91
93
|
page = Vore::PageData.new(
|
92
94
|
content: rewritten_html_file,
|
93
|
-
title: @
|
94
|
-
meta: @
|
95
|
+
title: @meta_extractor.title,
|
96
|
+
meta: @meta_extractor.meta,
|
95
97
|
path: url_path,
|
96
98
|
)
|
97
99
|
|
98
100
|
yield page
|
99
101
|
end
|
100
102
|
|
103
|
+
def rewrite(html_file)
|
104
|
+
@selma.rewrite(html_file)
|
105
|
+
rescue StandardError => e
|
106
|
+
Vore.logger.warn("Error rewriting #{path}: #{e}")
|
107
|
+
@results[:unprocessed_pages] << path
|
108
|
+
""
|
109
|
+
end
|
110
|
+
|
101
111
|
def run_command(website, delay: 0)
|
102
112
|
pid = Process.spawn(
|
103
113
|
@executable,
|
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
module Vore
|
4
4
|
module Handlers
|
5
|
-
class
|
5
|
+
class MetaExtractor
|
6
6
|
SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
|
7
7
|
|
8
8
|
attr_reader :title, :meta
|
@@ -19,22 +19,14 @@ module Vore
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def handle_element(element)
|
22
|
-
if element.tag_name == "
|
23
|
-
element.tag_name == "form" ||
|
24
|
-
element.tag_name == "style" ||
|
25
|
-
element.tag_name == "noscript" ||
|
26
|
-
element.tag_name == "script" ||
|
27
|
-
element.tag_name == "svg"
|
28
|
-
element.remove
|
29
|
-
elsif element.tag_name == "title"
|
22
|
+
if element.tag_name == "title"
|
30
23
|
@within_title = true
|
24
|
+
|
31
25
|
element.remove
|
32
26
|
elsif element.tag_name == "meta"
|
33
27
|
return if element.attributes["name"].nil?
|
34
28
|
|
35
29
|
@meta[element.attributes["name"]] = element.attributes["content"]
|
36
|
-
else
|
37
|
-
element.remove_and_keep_content
|
38
30
|
end
|
39
31
|
end
|
40
32
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Vore
|
4
|
+
module Handlers
|
5
|
+
class TagRemover
|
6
|
+
SELECTOR = Selma::Selector.new(match_element: "*")
|
7
|
+
|
8
|
+
def selector
|
9
|
+
SELECTOR
|
10
|
+
end
|
11
|
+
|
12
|
+
UNNECESSARY_TAGS = [
|
13
|
+
# Remove code elements
|
14
|
+
"pre",
|
15
|
+
|
16
|
+
# Remove unnecessary elements
|
17
|
+
"head",
|
18
|
+
|
19
|
+
"form",
|
20
|
+
"style",
|
21
|
+
"noscript",
|
22
|
+
"script",
|
23
|
+
"svg",
|
24
|
+
|
25
|
+
# Remove unnecessary nav elements
|
26
|
+
"header",
|
27
|
+
"footer",
|
28
|
+
"nav",
|
29
|
+
"aside",
|
30
|
+
]
|
31
|
+
|
32
|
+
CONTENT_TO_KEEP = [
|
33
|
+
"html",
|
34
|
+
"body",
|
35
|
+
]
|
36
|
+
|
37
|
+
def handle_element(element)
|
38
|
+
if UNNECESSARY_TAGS.include?(element.tag_name)
|
39
|
+
element.remove
|
40
|
+
elsif CONTENT_TO_KEEP.include?(element.tag_name)
|
41
|
+
element.remove_and_keep_content
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/vore/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: arm64-darwin
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-07-
|
11
|
+
date: 2024-07-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: listen
|
@@ -60,7 +60,8 @@ files:
|
|
60
60
|
- lib/vore.rb
|
61
61
|
- lib/vore/configuration.rb
|
62
62
|
- lib/vore/crawler.rb
|
63
|
-
- lib/vore/handlers/
|
63
|
+
- lib/vore/handlers/meta_extractor.rb
|
64
|
+
- lib/vore/handlers/tag_remover.rb
|
64
65
|
- lib/vore/logger.rb
|
65
66
|
- lib/vore/minitest_helper.rb
|
66
67
|
- lib/vore/page.rb
|