vore 0.1.1.4-x86_64-darwin → 0.2.1-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/exe/vore-spider +0 -0
- data/lib/vore/crawler.rb +20 -7
- data/lib/vore/handlers/content_extractor.rb +25 -2
- data/lib/vore/page_data.rb +14 -0
- data/lib/vore/version.rb +1 -1
- data/lib/vore.rb +3 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 31497cb9cec566256acca79e43bfb5b6fe8791bea446bba60ca895f2be3ab91b
|
4
|
+
data.tar.gz: 2252352ba0f823215117c143d0608431bfd6e0037f9b122f6711c191dc7f99c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a0f51ccfc30cb95ff87b6275c033b343ca7c07994f9937fad18a670d8973dd2c0d2db8c40f5849d2e32a372c1e755e97b3489b46532bd9c0317e7bfdc88786e0
|
7
|
+
data.tar.gz: 9517a87119025f30bdb105fa89440990ec260b32db10e497a6775271fec547358a4552edf6033508d27225a12427bd61272b13e0585824802c4490d7187173de
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
|
4
4
|
|
5
|
-
Vore
|
5
|
+
Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/crawler.rb
CHANGED
@@ -6,15 +6,18 @@ module Vore
|
|
6
6
|
# This is the class that starts and controls the crawling
|
7
7
|
class Crawler
|
8
8
|
PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
|
9
|
+
FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
|
9
10
|
|
10
11
|
# Creates a crawler
|
11
12
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
12
13
|
def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
|
13
14
|
@denylist_regexp = Regexp.union(denylist)
|
14
15
|
|
15
|
-
@
|
16
|
-
@
|
17
|
-
|
16
|
+
@content_extractor = Vole::Handlers::ContentExtractor.new
|
17
|
+
@selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
|
18
|
+
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
19
|
+
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
20
|
+
@parent_output_dir = "tmp/vore"
|
18
21
|
|
19
22
|
return if File.exist?(@executable)
|
20
23
|
|
@@ -23,7 +26,7 @@ module Vore
|
|
23
26
|
end
|
24
27
|
|
25
28
|
def scrape_each_page(website, &block)
|
26
|
-
output_dir = "#{@
|
29
|
+
output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
|
27
30
|
Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
|
28
31
|
|
29
32
|
output = %x(#{@executable} \
|
@@ -35,13 +38,23 @@ module Vore
|
|
35
38
|
|
36
39
|
Vore.logger.info("Vore finished crawling #{website}: #{output}")
|
37
40
|
|
38
|
-
Dir.glob("
|
41
|
+
Dir.glob(File.join(output_dir, "**", "*")).each do |path|
|
39
42
|
next unless File.file?(path)
|
40
43
|
|
41
|
-
html_file = File.read(path)
|
44
|
+
html_file = File.read(path).force_encoding("UTF-8")
|
42
45
|
rewritten_html_file = @selma.rewrite(html_file)
|
43
46
|
|
44
|
-
|
47
|
+
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
48
|
+
url_path = path.split(FILE_SEPERATOR)[3..].join("/")
|
49
|
+
|
50
|
+
page = Vore::PageData.new(
|
51
|
+
content: rewritten_html_file,
|
52
|
+
title: @content_extractor.title,
|
53
|
+
meta: @content_extractor.meta,
|
54
|
+
path: url_path,
|
55
|
+
)
|
56
|
+
|
57
|
+
yield page
|
45
58
|
ensure
|
46
59
|
File.delete(path) if File.file?(path)
|
47
60
|
end
|
data/lib/vore/handlers/content_extractor.rb
CHANGED
@@ -3,19 +3,42 @@
|
|
3
3
|
module Vole
|
4
4
|
module Handlers
|
5
5
|
class ContentExtractor
|
6
|
-
SELECTOR = Selma::Selector.new(match_element: "*")
|
6
|
+
SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
|
7
|
+
|
8
|
+
attr_reader :title, :meta
|
9
|
+
|
10
|
+
def initialize
|
11
|
+
super
|
12
|
+
@title = ""
|
13
|
+
@meta = {}
|
14
|
+
@within_title = false
|
15
|
+
end
|
7
16
|
|
8
17
|
def selector
|
9
18
|
SELECTOR
|
10
19
|
end
|
11
20
|
|
12
21
|
def handle_element(element)
|
13
|
-
if element.tag_name == "pre" || element.tag_name == "code"
|
22
|
+
if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
|
23
|
+
element.remove
|
24
|
+
elsif element.tag_name == "title"
|
25
|
+
@within_title = true
|
14
26
|
element.remove
|
27
|
+
elsif element.tag_name == "meta"
|
28
|
+
return if element.attributes["name"].nil?
|
29
|
+
|
30
|
+
@meta[element.attributes["name"]] = element.attributes["content"]
|
15
31
|
else
|
16
32
|
element.remove_and_keep_content
|
17
33
|
end
|
18
34
|
end
|
35
|
+
|
36
|
+
def handle_text_chunk(text)
|
37
|
+
if @within_title
|
38
|
+
@within_title = false
|
39
|
+
@title = text.to_s
|
40
|
+
end
|
41
|
+
end
|
19
42
|
end
|
20
43
|
end
|
21
44
|
end
|
data/lib/vore/version.rb
CHANGED
data/lib/vore.rb
CHANGED
@@ -14,6 +14,9 @@ require_relative "vore/version"
|
|
14
14
|
require_relative "vore/configuration"
|
15
15
|
require_relative "vore/logger"
|
16
16
|
require_relative "vore/crawler"
|
17
|
+
require_relative "vore/page"
|
18
|
+
require_relative "vore/page_data"
|
19
|
+
require_relative "vore/website"
|
17
20
|
|
18
21
|
module Vore
|
19
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1.4
|
4
|
+
version: 0.2.1
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
@@ -49,6 +49,7 @@ files:
|
|
49
49
|
- lib/vore/handlers/content_extractor.rb
|
50
50
|
- lib/vore/logger.rb
|
51
51
|
- lib/vore/page.rb
|
52
|
+
- lib/vore/page_data.rb
|
52
53
|
- lib/vore/version.rb
|
53
54
|
- lib/vore/website.rb
|
54
55
|
homepage: https://github.com/gjtorikian/vore
|
@@ -75,5 +76,5 @@ requirements: []
|
|
75
76
|
rubygems_version: 3.5.3
|
76
77
|
signing_key:
|
77
78
|
specification_version: 4
|
78
|
-
summary: Quickly
|
79
|
+
summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
|
79
80
|
test_files: []
|