vore 0.4.0-x86_64-darwin → 0.5.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +24 -10
- data/README.md +23 -0
- data/exe/vore-spider +0 -0
- data/lib/vore/configuration.rb +1 -0
- data/lib/vore/crawler.rb +25 -15
- data/lib/vore/handlers/{content_extractor.rb → meta_extractor.rb} +3 -11
- data/lib/vore/handlers/tag_remover.rb +46 -0
- data/lib/vore/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 375d0bfc0c775c2be95de44e7d5a750da641f00332490deace0863b569331cb6
|
4
|
+
data.tar.gz: 9e39c8fbb2b9813e86138c76364e5de9539390d8569e76f35bdb7dcd25a18cae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8ed087d41652d299c514290f16dcf88e65f38d2c719dc3dc914b8ad1f9cd6674aca09053310b1cd08aeed256428fd98cabe2e9cdc5c93e0fa27838393c04759
|
7
|
+
data.tar.gz: 94cc7f34e9c8cccbf85e24b1ad38dd17e91fb0f736838422c61a09532aefa323973cc410c3f72d52939b6e75d909f703ab8c93e2b4da3195721102cedd4760e1
|
data/Cargo.lock
CHANGED
@@ -238,7 +238,7 @@ version = "0.2.3"
|
|
238
238
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
239
239
|
checksum = "fc229be27b394115abdc89e09500d5030407734d21a143a833eae5f136821bcd"
|
240
240
|
dependencies = [
|
241
|
-
"compact_str",
|
241
|
+
"compact_str 0.7.1",
|
242
242
|
"serde",
|
243
243
|
]
|
244
244
|
|
@@ -346,6 +346,20 @@ dependencies = [
|
|
346
346
|
"static_assertions",
|
347
347
|
]
|
348
348
|
|
349
|
+
[[package]]
|
350
|
+
name = "compact_str"
|
351
|
+
version = "0.8.0"
|
352
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
353
|
+
checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644"
|
354
|
+
dependencies = [
|
355
|
+
"castaway",
|
356
|
+
"cfg-if",
|
357
|
+
"itoa",
|
358
|
+
"rustversion",
|
359
|
+
"ryu",
|
360
|
+
"static_assertions",
|
361
|
+
]
|
362
|
+
|
349
363
|
[[package]]
|
350
364
|
name = "cookie"
|
351
365
|
version = "0.18.1"
|
@@ -1418,18 +1432,18 @@ dependencies = [
|
|
1418
1432
|
|
1419
1433
|
[[package]]
|
1420
1434
|
name = "rb-sys"
|
1421
|
-
version = "0.9.
|
1435
|
+
version = "0.9.99"
|
1422
1436
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1423
|
-
checksum = "
|
1437
|
+
checksum = "d83151cfea2b67db2444f68c53b119ff77cff235ad711c765072e4daf8f3185b"
|
1424
1438
|
dependencies = [
|
1425
1439
|
"rb-sys-build",
|
1426
1440
|
]
|
1427
1441
|
|
1428
1442
|
[[package]]
|
1429
1443
|
name = "rb-sys-build"
|
1430
|
-
version = "0.9.
|
1444
|
+
version = "0.9.99"
|
1431
1445
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1432
|
-
checksum = "
|
1446
|
+
checksum = "32d038214c118ad4a75db555ccb78672e17e1c5c10f344456cd129008dbaa7de"
|
1433
1447
|
dependencies = [
|
1434
1448
|
"bindgen",
|
1435
1449
|
"lazy_static",
|
@@ -1786,14 +1800,14 @@ dependencies = [
|
|
1786
1800
|
|
1787
1801
|
[[package]]
|
1788
1802
|
name = "spider"
|
1789
|
-
version = "1.99.
|
1803
|
+
version = "1.99.11"
|
1790
1804
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1791
|
-
checksum = "
|
1805
|
+
checksum = "e23ad22d5e55b09f480f849b37dd2fe315e3cf1df0f5261209aa5482483c617f"
|
1792
1806
|
dependencies = [
|
1793
1807
|
"ahash",
|
1794
1808
|
"bytes",
|
1795
1809
|
"case_insensitive_string",
|
1796
|
-
"compact_str",
|
1810
|
+
"compact_str 0.8.0",
|
1797
1811
|
"cssparser",
|
1798
1812
|
"ego-tree",
|
1799
1813
|
"fast_html5ever",
|
@@ -1817,9 +1831,9 @@ dependencies = [
|
|
1817
1831
|
|
1818
1832
|
[[package]]
|
1819
1833
|
name = "spider_cli"
|
1820
|
-
version = "1.99.
|
1834
|
+
version = "1.99.11"
|
1821
1835
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1822
|
-
checksum = "
|
1836
|
+
checksum = "be5da7d570871156c08025bdc13de670807d36a90be94f8aa8342a04e5268662"
|
1823
1837
|
dependencies = [
|
1824
1838
|
"clap",
|
1825
1839
|
"env_logger",
|
data/README.md
CHANGED
@@ -42,6 +42,29 @@ The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), s
|
|
42
42
|
| `delete_after_yield` | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
|
43
43
|
| `log_level` | The logging level. | `:warn` |
|
44
44
|
|
45
|
+
### Processing pages
|
46
|
+
|
47
|
+
Vore processes HTML using handlers. By default, there are two:
|
48
|
+
|
49
|
+
* The `MetaExtractor`, which extracts information from your `title` and `meta` tags
|
50
|
+
* The `TagRemover`, which removes unnecessary elements such as `header`, `footer`, and `script`
|
51
|
+
|
52
|
+
If you wish to process the HTML further, you can provide your own handler:
|
53
|
+
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
Vore::Crawler.new(handlers: [MySpecialHandler.new])
|
57
|
+
```
|
58
|
+
|
59
|
+
Handlers are defined using [Selma](https://github.com/gjtorikian/selma?tab=readme-ov-file#defining-handlers). Note that the `MetaExtractor` is always included and defined first, but if you pass anything in the `handlers` array, it replaces Vore's other default handlers. You can, of course, choose to include them manually:
|
60
|
+
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
# preserve Vore's default content handler while adding your own;
|
64
|
+
# `MetaExtractor` is prefixed to the front
|
65
|
+
Vore::Crawler.new(handlers: [Vore::Handlers::TagRemover.new, MySpecialHandler.new])
|
66
|
+
```
|
67
|
+
|
45
68
|
### In tests
|
46
69
|
|
47
70
|
Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/configuration.rb
CHANGED
data/lib/vore/crawler.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative "handlers/
|
3
|
+
require_relative "handlers/meta_extractor"
|
4
|
+
require_relative "handlers/tag_remover"
|
4
5
|
|
5
6
|
require "listen"
|
6
7
|
|
@@ -10,13 +11,20 @@ module Vore
|
|
10
11
|
PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
|
11
12
|
FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
|
12
13
|
|
13
|
-
attr_reader :output_dir
|
14
|
+
attr_reader :handlers, :output_dir
|
14
15
|
|
15
16
|
# Creates a crawler
|
16
17
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
17
|
-
def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, options: {})
|
18
|
-
@
|
19
|
-
|
18
|
+
def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
|
19
|
+
@meta_extractor = Vore::Handlers::MetaExtractor.new
|
20
|
+
|
21
|
+
@handlers = if handlers.nil?
|
22
|
+
[@meta_extractor, Vore::Handlers::TagRemover.new]
|
23
|
+
else
|
24
|
+
handlers.unshift(@meta_extractor)
|
25
|
+
end
|
26
|
+
|
27
|
+
@selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)
|
20
28
|
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
21
29
|
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
22
30
|
@options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
|
@@ -70,34 +78,36 @@ module Vore
|
|
70
78
|
@results[:pages_visited] += 1
|
71
79
|
|
72
80
|
html_file = File.read(path).force_encoding("UTF-8")
|
73
|
-
rewritten_html_file = ""
|
74
81
|
|
75
82
|
if html_file.empty?
|
76
83
|
@results[:unprocessed_pages] << path
|
77
84
|
return
|
78
85
|
end
|
79
86
|
|
80
|
-
|
81
|
-
|
82
|
-
rescue StandardError => e
|
83
|
-
Vore.logger.warn("Error rewriting #{path}: #{e}")
|
84
|
-
@results[:unprocessed_pages] << path
|
85
|
-
return
|
86
|
-
end
|
87
|
+
rewritten_html_file = @selma.rewrite(html_file)
|
88
|
+
return if rewritten_html_file.empty?
|
87
89
|
|
88
90
|
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
89
91
|
url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
|
90
92
|
|
91
93
|
page = Vore::PageData.new(
|
92
94
|
content: rewritten_html_file,
|
93
|
-
title: @
|
94
|
-
meta: @
|
95
|
+
title: @meta_extractor.title,
|
96
|
+
meta: @meta_extractor.meta,
|
95
97
|
path: url_path,
|
96
98
|
)
|
97
99
|
|
98
100
|
yield page
|
99
101
|
end
|
100
102
|
|
103
|
+
def rewrite(html_file)
|
104
|
+
@selma.rewrite(html_file)
|
105
|
+
rescue StandardError => e
|
106
|
+
Vore.logger.warn("Error rewriting #{path}: #{e}")
|
107
|
+
@results[:unprocessed_pages] << path
|
108
|
+
""
|
109
|
+
end
|
110
|
+
|
101
111
|
def run_command(website, delay: 0)
|
102
112
|
pid = Process.spawn(
|
103
113
|
@executable,
|
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
module Vore
|
4
4
|
module Handlers
|
5
|
-
class
|
5
|
+
class MetaExtractor
|
6
6
|
SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
|
7
7
|
|
8
8
|
attr_reader :title, :meta
|
@@ -19,22 +19,14 @@ module Vore
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def handle_element(element)
|
22
|
-
if element.tag_name == "
|
23
|
-
element.tag_name == "form" ||
|
24
|
-
element.tag_name == "style" ||
|
25
|
-
element.tag_name == "noscript" ||
|
26
|
-
element.tag_name == "script" ||
|
27
|
-
element.tag_name == "svg"
|
28
|
-
element.remove
|
29
|
-
elsif element.tag_name == "title"
|
22
|
+
if element.tag_name == "title"
|
30
23
|
@within_title = true
|
24
|
+
|
31
25
|
element.remove
|
32
26
|
elsif element.tag_name == "meta"
|
33
27
|
return if element.attributes["name"].nil?
|
34
28
|
|
35
29
|
@meta[element.attributes["name"]] = element.attributes["content"]
|
36
|
-
else
|
37
|
-
element.remove_and_keep_content
|
38
30
|
end
|
39
31
|
end
|
40
32
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Vore
|
4
|
+
module Handlers
|
5
|
+
class TagRemover
|
6
|
+
SELECTOR = Selma::Selector.new(match_element: "*")
|
7
|
+
|
8
|
+
def selector
|
9
|
+
SELECTOR
|
10
|
+
end
|
11
|
+
|
12
|
+
UNNECESSARY_TAGS = [
|
13
|
+
# Remove code elements
|
14
|
+
"pre",
|
15
|
+
|
16
|
+
# Remove unnecessary elements
|
17
|
+
"head",
|
18
|
+
|
19
|
+
"form",
|
20
|
+
"style",
|
21
|
+
"noscript",
|
22
|
+
"script",
|
23
|
+
"svg",
|
24
|
+
|
25
|
+
# Remove unnecessary nav elements
|
26
|
+
"header",
|
27
|
+
"footer",
|
28
|
+
"nav",
|
29
|
+
"aside",
|
30
|
+
]
|
31
|
+
|
32
|
+
CONTENT_TO_KEEP = [
|
33
|
+
"html",
|
34
|
+
"body",
|
35
|
+
]
|
36
|
+
|
37
|
+
def handle_element(element)
|
38
|
+
if UNNECESSARY_TAGS.include?(element.tag_name)
|
39
|
+
element.remove
|
40
|
+
elsif CONTENT_TO_KEEP.include?(element.tag_name)
|
41
|
+
element.remove_and_keep_content
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/vore/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-07-
|
11
|
+
date: 2024-07-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: listen
|
@@ -60,7 +60,8 @@ files:
|
|
60
60
|
- lib/vore.rb
|
61
61
|
- lib/vore/configuration.rb
|
62
62
|
- lib/vore/crawler.rb
|
63
|
-
- lib/vore/handlers/
|
63
|
+
- lib/vore/handlers/meta_extractor.rb
|
64
|
+
- lib/vore/handlers/tag_remover.rb
|
64
65
|
- lib/vore/logger.rb
|
65
66
|
- lib/vore/minitest_helper.rb
|
66
67
|
- lib/vore/page.rb
|