vore 0.2.8-x86_64-linux → 0.4.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +4 -4
- data/README.md +67 -5
- data/exe/vore-spider +0 -0
- data/lib/vore/configuration.rb +8 -1
- data/lib/vore/crawler.rb +78 -57
- data/lib/vore/handlers/content_extractor.rb +1 -1
- data/lib/vore/logger.rb +4 -0
- data/lib/vore/minitest_helper.rb +75 -0
- data/lib/vore/version.rb +1 -1
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8bf0b076a7ec664aaf3b89dcc04ef818c0e001471e0030203ab740e43cf1f234
|
4
|
+
data.tar.gz: 86042da1109b675a3a0ed0aff3b59dc48bb2a066ee809cd9ab5535e9d137ed4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bbf8c117fe9ccea10de4048f279b8e95feb6e31ba99af751dfd19eb7a9dfab60a46ce1aad7801824891211dad07c6eb5f53fc7b332ef26758f9be71c3cd02a74
|
7
|
+
data.tar.gz: b8d4220844655b5fdd7ab033dcad91d76c9872629f1555542d6a6d2e5f706ea50fe32b70ec38587ed8147858f60e9c9bfb26768067dacd590a44d8732c5a2811
|
data/Cargo.lock
CHANGED
@@ -1786,9 +1786,9 @@ dependencies = [
|
|
1786
1786
|
|
1787
1787
|
[[package]]
|
1788
1788
|
name = "spider"
|
1789
|
-
version = "1.99.
|
1789
|
+
version = "1.99.8"
|
1790
1790
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1791
|
-
checksum = "
|
1791
|
+
checksum = "525670cdc6aec8f4cb91da17ce0255050e89eb7c889272216d8a4fb644d67530"
|
1792
1792
|
dependencies = [
|
1793
1793
|
"ahash",
|
1794
1794
|
"bytes",
|
@@ -1817,9 +1817,9 @@ dependencies = [
|
|
1817
1817
|
|
1818
1818
|
[[package]]
|
1819
1819
|
name = "spider_cli"
|
1820
|
-
version = "1.99.
|
1820
|
+
version = "1.99.8"
|
1821
1821
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1822
|
-
checksum = "
|
1822
|
+
checksum = "9bd9d95178dc0715608d5f28501c8321de3e14b40046c6584a12ffce96f0a676"
|
1823
1823
|
dependencies = [
|
1824
1824
|
"clap",
|
1825
1825
|
"env_logger",
|
data/README.md
CHANGED
@@ -18,13 +18,75 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
18
18
|
## Usage
|
19
19
|
|
20
20
|
```ruby
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
21
|
+
crawler = Vore::Crawler.new
|
22
|
+
crawler.scrape_each_page("https://choosealicense.com") do |page|
|
23
|
+
puts page
|
24
|
+
end
|
25
25
|
```
|
26
26
|
|
27
|
-
Each `page` is
|
27
|
+
Each `page` is a simple class consisting of the following values:
|
28
|
+
|
29
|
+
* `content`: the text of the HTML document, sans tags
|
30
|
+
* `title`: the title of the HTML document (if any)
|
31
|
+
* `meta`: the document's meta tags (if any)
|
32
|
+
* `path`: the document's path
|
33
|
+
|
34
|
+
The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.
|
35
|
+
|
36
|
+
### Configuration
|
37
|
+
|
38
|
+
| Name | Description | Default |
|
39
|
+
| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
40
|
+
| `delay` | A value (in milliseconds) which introduces an artificial delay when crawling. Useful for situations where there's rate limiting involved. | `0` |
|
41
|
+
| `output_dir` | Where the resulting HTML files are stored. | `"tmp/vore"` |
|
42
|
+
| `delete_after_yield` | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
|
43
|
+
| `log_level` | The logging level. | `:warn` |
|
44
|
+
|
45
|
+
### In tests
|
46
|
+
|
47
|
+
Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
|
48
|
+
|
49
|
+
You can override any of the existing methods to suit your application's needs. For example, if you prefer HTML to be generated by Faker, you can create and require a file that looks like the following:
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
|
53
|
+
require "vore/minitest_helper"
|
54
|
+
|
55
|
+
module Vore
|
56
|
+
module TestHelperExtension
|
57
|
+
DOCUMENT_TITLES = [
|
58
|
+
"Hello, I need help",
|
59
|
+
"I need to update my payment information",
|
60
|
+
]
|
61
|
+
DOCUMENT_CONTENT = [
|
62
|
+
"Hey, I'm having trouble with my computer. Can you help me?",
|
63
|
+
# v--- always creates three page chunks
|
64
|
+
"I need to update my payment information. Like, now. Right now. Now. Can you help me? Please? Now?" + "Can you help me? Please? Now?" * 100,
|
65
|
+
]
|
66
|
+
|
67
|
+
def content
|
68
|
+
@counter = -1 unless defined?(@counter)
|
69
|
+
@counter += 1
|
70
|
+
|
71
|
+
html = "<!DOCTYPE html><html><head><title>#{DOCUMENT_TITLES[@counter]}</title>"
|
72
|
+
|
73
|
+
meta_tag_count.times do # arbitrarily set to 5
|
74
|
+
html += "<meta name=\"#{Faker::Lorem.word}\" content=\"#{Faker::Lorem.word}\" />"
|
75
|
+
end
|
76
|
+
|
77
|
+
html += "</head><body>"
|
78
|
+
|
79
|
+
html += "<p>#{DOCUMENT_CONTENT[@counter]}</p>"
|
80
|
+
|
81
|
+
html += "</body></html>"
|
82
|
+
|
83
|
+
html
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
Vore::TestHelper.prepend(Vore::TestHelperExtension)
|
88
|
+
end
|
89
|
+
```
|
28
90
|
|
29
91
|
## Development
|
30
92
|
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/configuration.rb
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
module
|
3
|
+
module Vore
|
4
4
|
class Configuration
|
5
5
|
DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.dup.merge({
|
6
6
|
allow_doctype: false,
|
7
7
|
})
|
8
|
+
|
9
|
+
DEFAULT_OPTIONS = {
|
10
|
+
delay: 0,
|
11
|
+
output_dir: "tmp/vore",
|
12
|
+
delete_after_yield: true,
|
13
|
+
log_level: :warn,
|
14
|
+
}
|
8
15
|
end
|
9
16
|
end
|
data/lib/vore/crawler.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
require_relative "handlers/content_extractor"
|
4
4
|
|
5
|
+
require "listen"
|
6
|
+
|
5
7
|
module Vore
|
6
8
|
# This is the class that starts and controls the crawling
|
7
9
|
class Crawler
|
@@ -12,14 +14,22 @@ module Vore
|
|
12
14
|
|
13
15
|
# Creates a crawler
|
14
16
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
15
|
-
def initialize(
|
16
|
-
@
|
17
|
-
|
18
|
-
@content_extractor = Vole::Handlers::ContentExtractor.new
|
17
|
+
def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, options: {})
|
18
|
+
@content_extractor = Vore::Handlers::ContentExtractor.new
|
19
19
|
@selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
|
20
20
|
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
21
21
|
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
22
|
-
@
|
22
|
+
@options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
|
23
|
+
@parent_output_dir = @options[:output_dir]
|
24
|
+
@parent_output_dir_len = @parent_output_dir.to_s.split(FILE_SEPERATOR).size
|
25
|
+
|
26
|
+
Vore.logger.level = @options[:log_level]
|
27
|
+
Listen.logger = Vore.logger
|
28
|
+
|
29
|
+
@results = {
|
30
|
+
pages_visited: 0,
|
31
|
+
unprocessed_pages: [],
|
32
|
+
}
|
23
33
|
|
24
34
|
return if File.exist?(@executable)
|
25
35
|
|
@@ -29,71 +39,82 @@ module Vore
|
|
29
39
|
|
30
40
|
def scrape_each_page(website, &block)
|
31
41
|
@output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
|
32
|
-
|
33
|
-
|
34
|
-
|
42
|
+
FileUtils.rm_rf(@output_dir)
|
43
|
+
FileUtils.mkdir_p(@output_dir)
|
44
|
+
|
45
|
+
listener = Listen.to(@output_dir) do |_modified, added, _removed|
|
46
|
+
if added.any?
|
47
|
+
added.each do |path|
|
48
|
+
process_file(path, &block)
|
49
|
+
File.delete(path) if @options[:delete_after_yield]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
listener.start
|
35
54
|
|
36
|
-
Vore.logger.info("Vore
|
55
|
+
Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
|
37
56
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
57
|
+
begin
|
58
|
+
run_command(website, delay: @options[:delay])
|
59
|
+
ensure
|
60
|
+
sleep(0.5) # give listener time to clean up
|
61
|
+
listener.stop
|
62
|
+
end
|
43
63
|
|
44
|
-
|
45
|
-
next unless File.file?(path)
|
64
|
+
Vore.logger.info("Vore finished crawling #{website}")
|
46
65
|
|
47
|
-
|
66
|
+
@results
|
67
|
+
end
|
48
68
|
|
49
|
-
|
50
|
-
|
69
|
+
def process_file(path, &block)
|
70
|
+
@results[:pages_visited] += 1
|
51
71
|
|
52
|
-
|
53
|
-
|
54
|
-
results[:unprocessed_pages] << path
|
55
|
-
next
|
56
|
-
end
|
72
|
+
html_file = File.read(path).force_encoding("UTF-8")
|
73
|
+
rewritten_html_file = ""
|
57
74
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
results[:pages_unprocessed] += 1
|
63
|
-
next
|
64
|
-
end
|
75
|
+
if html_file.empty?
|
76
|
+
@results[:unprocessed_pages] << path
|
77
|
+
return
|
78
|
+
end
|
65
79
|
|
66
|
-
|
67
|
-
|
80
|
+
begin
|
81
|
+
rewritten_html_file = @selma.rewrite(html_file)
|
82
|
+
rescue StandardError => e
|
83
|
+
Vore.logger.warn("Error rewriting #{path}: #{e}")
|
84
|
+
@results[:unprocessed_pages] << path
|
85
|
+
return
|
86
|
+
end
|
68
87
|
|
69
|
-
|
70
|
-
|
71
|
-
title: @content_extractor.title,
|
72
|
-
meta: @content_extractor.meta,
|
73
|
-
path: url_path,
|
74
|
-
)
|
88
|
+
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
89
|
+
url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
|
75
90
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
91
|
+
page = Vore::PageData.new(
|
92
|
+
content: rewritten_html_file,
|
93
|
+
title: @content_extractor.title,
|
94
|
+
meta: @content_extractor.meta,
|
95
|
+
path: url_path,
|
96
|
+
)
|
80
97
|
|
81
|
-
|
98
|
+
yield page
|
82
99
|
end
|
83
100
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
--
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
101
|
+
def run_command(website, delay: 0)
|
102
|
+
pid = Process.spawn(
|
103
|
+
@executable,
|
104
|
+
"--user-agent",
|
105
|
+
user_agent,
|
106
|
+
"--delay",
|
107
|
+
delay.to_s,
|
108
|
+
"--url",
|
109
|
+
website,
|
110
|
+
"download",
|
111
|
+
"-t",
|
112
|
+
@output_dir,
|
113
|
+
)
|
114
|
+
|
115
|
+
_, _status = Process.waitpid2(pid)
|
116
|
+
rescue StandardError => e
|
117
|
+
Vore.logger.error(e)
|
97
118
|
end
|
98
119
|
|
99
120
|
def user_agent
|
data/lib/vore/logger.rb
CHANGED
data/lib/vore/minitest_helper.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "minitest/mock"
|
4
|
+
require "net/http"
|
5
|
+
|
6
|
+
module Vore
|
7
|
+
module TestHelper
|
8
|
+
def run_command(website, **options)
|
9
|
+
loop_times.times do |time|
|
10
|
+
net_http = ::Minitest::Mock.new
|
11
|
+
response = ::Minitest::Mock.new
|
12
|
+
response.expect(:is_a?, true, [::Net::HTTPSuccess])
|
13
|
+
|
14
|
+
# we need to trigger an HTTP call to pretend that we're making
|
15
|
+
# an external request. this way, the gem hooks into VCR/Webmock
|
16
|
+
net_http.expect(:get, response)
|
17
|
+
html = content
|
18
|
+
response.expect(:body, html)
|
19
|
+
time_s = time.to_s
|
20
|
+
uri = URI("#{website}/#{time_s}")
|
21
|
+
Net::HTTP.get(uri)
|
22
|
+
|
23
|
+
file = File.join(@output_dir, time_s)
|
24
|
+
File.write("#{file}.html", html)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def loop_times=(times)
|
29
|
+
@loop_times = times
|
30
|
+
end
|
31
|
+
|
32
|
+
def loop_times
|
33
|
+
@loop_times ||= 5
|
34
|
+
end
|
35
|
+
|
36
|
+
def meta_tag_count=(count)
|
37
|
+
@meta_tag_count = count
|
38
|
+
end
|
39
|
+
|
40
|
+
def meta_tag_count
|
41
|
+
@meta_tag_count ||= 5
|
42
|
+
end
|
43
|
+
|
44
|
+
def generate_word
|
45
|
+
("a".."z").to_a.sample(8).join
|
46
|
+
end
|
47
|
+
|
48
|
+
def generate_sentence
|
49
|
+
Array.new((5..15).to_a.sample) { generate_word }.join(" ")
|
50
|
+
end
|
51
|
+
|
52
|
+
def generate_path
|
53
|
+
Array.new((1..3).to_a.sample) { generate_word }.join("/")
|
54
|
+
end
|
55
|
+
|
56
|
+
def content
|
57
|
+
html = "<!DOCTYPE html><html><head><title>#{generate_word}</title>"
|
58
|
+
meta_tag_count.times do
|
59
|
+
html += "<meta name=\"#{generate_word}\" content=\"#{generate_word}\" />"
|
60
|
+
end
|
61
|
+
|
62
|
+
html += "</head><body>"
|
63
|
+
|
64
|
+
50.times do
|
65
|
+
tagname = ["p", "h1", "h2", "h3", "h4", "h5", "h6"].sample
|
66
|
+
html += "<#{tagname}>#{generate_sentence}</#{tagname}>"
|
67
|
+
end
|
68
|
+
|
69
|
+
html += "</body></html>"
|
70
|
+
html
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
Vore::Crawler.prepend(Vore::TestHelper)
|
75
|
+
end
|
data/lib/vore/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,30 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: x86_64-linux
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-07-
|
11
|
+
date: 2024-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: listen
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.9'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.9'
|
27
|
+
force_ruby_platform: false
|
13
28
|
- !ruby/object:Gem::Dependency
|
14
29
|
name: selma
|
15
30
|
requirement: !ruby/object:Gem::Requirement
|
@@ -47,6 +62,7 @@ files:
|
|
47
62
|
- lib/vore/crawler.rb
|
48
63
|
- lib/vore/handlers/content_extractor.rb
|
49
64
|
- lib/vore/logger.rb
|
65
|
+
- lib/vore/minitest_helper.rb
|
50
66
|
- lib/vore/page.rb
|
51
67
|
- lib/vore/page_data.rb
|
52
68
|
- lib/vore/version.rb
|