vore 0.1.1.4-x86_64-linux → 0.2.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/exe/vore-spider +0 -0
- data/lib/vore/crawler.rb +17 -4
- data/lib/vore/handlers/content_extractor.rb +25 -2
- data/lib/vore/page_data.rb +14 -0
- data/lib/vore/version.rb +1 -1
- data/lib/vore.rb +3 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7427980069e4d526d966f4e94f7f4f0e9f35831c2202d3ce6cdf93d7ed4db4cb
|
4
|
+
data.tar.gz: fd5b467cfa827e9c0b776b03982e5b2a683725b57be3d83c0e06e924634bfe7d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b252342c22884e71703595a880dc1d8c3f64daaaecbdf60744245ef8e741441ecb23de978f80531ed4a48d67283f343d2455992d88832ed555bf78c736232e7
|
7
|
+
data.tar.gz: 7b85dc417826abcac6d8d008e1b461aba77857d2fcf8a37953946e6a0b9f9014420270c3635c819e3a698df36a6ab58fba518135784f9b84887c9bf5ba8a2db6
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
|
4
4
|
|
5
|
-
Vore
|
5
|
+
Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/crawler.rb
CHANGED
@@ -6,14 +6,17 @@ module Vore
|
|
6
6
|
# This is the class that starts and controls the crawling
|
7
7
|
class Crawler
|
8
8
|
PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
|
9
|
+
FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
|
9
10
|
|
10
11
|
# Creates a crawler
|
11
12
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
12
13
|
def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
|
13
14
|
@denylist_regexp = Regexp.union(denylist)
|
14
15
|
|
15
|
-
@
|
16
|
-
@
|
16
|
+
@content_extractor = Vole::Handlers::ContentExtractor.new
|
17
|
+
@selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
|
18
|
+
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
19
|
+
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
17
20
|
@output_dir = "tmp/vore"
|
18
21
|
|
19
22
|
return if File.exist?(@executable)
|
@@ -38,10 +41,20 @@ module Vore
|
|
38
41
|
Dir.glob("tmp/**/*").each do |path|
|
39
42
|
next unless File.file?(path)
|
40
43
|
|
41
|
-
html_file = File.read(path)
|
44
|
+
html_file = File.read(path).force_encoding("UTF-8")
|
42
45
|
rewritten_html_file = @selma.rewrite(html_file)
|
43
46
|
|
44
|
-
|
47
|
+
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
48
|
+
url_path = path.split(FILE_SEPERATOR)[3..].join("/")
|
49
|
+
|
50
|
+
page = Vore::PageData.new(
|
51
|
+
content: rewritten_html_file,
|
52
|
+
title: @content_extractor.title,
|
53
|
+
meta: @content_extractor.meta,
|
54
|
+
path: url_path,
|
55
|
+
)
|
56
|
+
|
57
|
+
yield page
|
45
58
|
ensure
|
46
59
|
File.delete(path) if File.file?(path)
|
47
60
|
end
|
data/lib/vore/handlers/content_extractor.rb
CHANGED
@@ -3,19 +3,42 @@
|
|
3
3
|
module Vole
|
4
4
|
module Handlers
|
5
5
|
class ContentExtractor
|
6
|
-
SELECTOR = Selma::Selector.new(match_element: "*")
|
6
|
+
SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
|
7
|
+
|
8
|
+
attr_reader :title, :meta
|
9
|
+
|
10
|
+
def initialize
|
11
|
+
super
|
12
|
+
@title = ""
|
13
|
+
@meta = {}
|
14
|
+
@within_title = false
|
15
|
+
end
|
7
16
|
|
8
17
|
def selector
|
9
18
|
SELECTOR
|
10
19
|
end
|
11
20
|
|
12
21
|
def handle_element(element)
|
13
|
-
if element.tag_name == "pre" || element.tag_name == "code"
|
22
|
+
if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
|
23
|
+
element.remove
|
24
|
+
elsif element.tag_name == "title"
|
25
|
+
@within_title = true
|
14
26
|
element.remove
|
27
|
+
elsif element.tag_name == "meta"
|
28
|
+
return if element.attributes["name"].nil?
|
29
|
+
|
30
|
+
@meta[element.attributes["name"]] = element.attributes["content"]
|
15
31
|
else
|
16
32
|
element.remove_and_keep_content
|
17
33
|
end
|
18
34
|
end
|
35
|
+
|
36
|
+
def handle_text_chunk(text)
|
37
|
+
if @within_title
|
38
|
+
@within_title = false
|
39
|
+
@title = text.to_s
|
40
|
+
end
|
41
|
+
end
|
19
42
|
end
|
20
43
|
end
|
21
44
|
end
|
data/lib/vore/version.rb
CHANGED
data/lib/vore.rb
CHANGED
@@ -14,6 +14,9 @@ require_relative "vore/version"
|
|
14
14
|
require_relative "vore/configuration"
|
15
15
|
require_relative "vore/logger"
|
16
16
|
require_relative "vore/crawler"
|
17
|
+
require_relative "vore/page"
|
18
|
+
require_relative "vore/page_data"
|
19
|
+
require_relative "vore/website"
|
17
20
|
|
18
21
|
module Vore
|
19
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1.4
|
4
|
+
version: 0.2.0
|
5
5
|
platform: x86_64-linux
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
@@ -49,6 +49,7 @@ files:
|
|
49
49
|
- lib/vore/handlers/content_extractor.rb
|
50
50
|
- lib/vore/logger.rb
|
51
51
|
- lib/vore/page.rb
|
52
|
+
- lib/vore/page_data.rb
|
52
53
|
- lib/vore/version.rb
|
53
54
|
- lib/vore/website.rb
|
54
55
|
homepage: https://github.com/gjtorikian/vore
|
@@ -75,5 +76,5 @@ requirements: []
|
|
75
76
|
rubygems_version: 3.5.3
|
76
77
|
signing_key:
|
77
78
|
specification_version: 4
|
78
|
-
summary: Quickly
|
79
|
+
summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
|
79
80
|
test_files: []
|