vore 0.1.1.4-arm64-darwin → 0.2.0-arm64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/exe/vore-spider +0 -0
- data/lib/vore/crawler.rb +17 -4
- data/lib/vore/handlers/content_extractor.rb +25 -2
- data/lib/vore/page_data.rb +14 -0
- data/lib/vore/version.rb +1 -1
- data/lib/vore.rb +3 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d2efc3993d1d65cebb2a5576331bc20ece0520a49f59c0d7f995a179d2b7f4b
|
4
|
+
data.tar.gz: 9bfa5dd808324b2d52b540937f2a3cd7562e453adf79ff3c982ac8fbab4e6339
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a1364b00820082923ac4ca1679564a4e2ee44d89f27539022a1551d7dafca7df93817c71581021ab67ebfb2e044fcc4799ff1abd8f412cdab88c73fc658d52df
|
7
|
+
data.tar.gz: e95deb9ca83978c56bdd96211f806042f3c5907ffff9fcfb90dddc486b3b766f1c207a56fd484a1ee554ae3b26fb89fa7220e5cfa30434a23447bc08ee61bc8f
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
|
4
4
|
|
5
|
-
Vore
|
5
|
+
Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/crawler.rb
CHANGED
@@ -6,14 +6,17 @@ module Vore
|
|
6
6
|
# This is the class that starts and controls the crawling
|
7
7
|
class Crawler
|
8
8
|
PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
|
9
|
+
FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
|
9
10
|
|
10
11
|
# Creates a crawler
|
11
12
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
12
13
|
def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
|
13
14
|
@denylist_regexp = Regexp.union(denylist)
|
14
15
|
|
15
|
-
@
|
16
|
-
@
|
16
|
+
@content_extractor = Vole::Handlers::ContentExtractor.new
|
17
|
+
@selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
|
18
|
+
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
19
|
+
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
17
20
|
@output_dir = "tmp/vore"
|
18
21
|
|
19
22
|
return if File.exist?(@executable)
|
@@ -38,10 +41,20 @@ module Vore
|
|
38
41
|
Dir.glob("tmp/**/*").each do |path|
|
39
42
|
next unless File.file?(path)
|
40
43
|
|
41
|
-
html_file = File.read(path)
|
44
|
+
html_file = File.read(path).force_encoding("UTF-8")
|
42
45
|
rewritten_html_file = @selma.rewrite(html_file)
|
43
46
|
|
44
|
-
|
47
|
+
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
48
|
+
url_path = path.split(FILE_SEPERATOR)[3..].join("/")
|
49
|
+
|
50
|
+
page = Vore::PageData.new(
|
51
|
+
content: rewritten_html_file,
|
52
|
+
title: @content_extractor.title,
|
53
|
+
meta: @content_extractor.meta,
|
54
|
+
path: url_path,
|
55
|
+
)
|
56
|
+
|
57
|
+
yield page
|
45
58
|
ensure
|
46
59
|
File.delete(path) if File.file?(path)
|
47
60
|
end
|
data/lib/vore/handlers/content_extractor.rb
CHANGED
@@ -3,19 +3,42 @@
|
|
3
3
|
module Vole
|
4
4
|
module Handlers
|
5
5
|
class ContentExtractor
|
6
|
-
SELECTOR = Selma::Selector.new(match_element: "*")
|
6
|
+
SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
|
7
|
+
|
8
|
+
attr_reader :title, :meta
|
9
|
+
|
10
|
+
def initialize
|
11
|
+
super
|
12
|
+
@title = ""
|
13
|
+
@meta = {}
|
14
|
+
@within_title = false
|
15
|
+
end
|
7
16
|
|
8
17
|
def selector
|
9
18
|
SELECTOR
|
10
19
|
end
|
11
20
|
|
12
21
|
def handle_element(element)
|
13
|
-
if element.tag_name == "pre" || element.tag_name == "code"
|
22
|
+
if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
|
23
|
+
element.remove
|
24
|
+
elsif element.tag_name == "title"
|
25
|
+
@within_title = true
|
14
26
|
element.remove
|
27
|
+
elsif element.tag_name == "meta"
|
28
|
+
return if element.attributes["name"].nil?
|
29
|
+
|
30
|
+
@meta[element.attributes["name"]] = element.attributes["content"]
|
15
31
|
else
|
16
32
|
element.remove_and_keep_content
|
17
33
|
end
|
18
34
|
end
|
35
|
+
|
36
|
+
def handle_text_chunk(text)
|
37
|
+
if @within_title
|
38
|
+
@within_title = false
|
39
|
+
@title = text.to_s
|
40
|
+
end
|
41
|
+
end
|
19
42
|
end
|
20
43
|
end
|
21
44
|
end
|
data/lib/vore/version.rb
CHANGED
data/lib/vore.rb
CHANGED
@@ -14,6 +14,9 @@ require_relative "vore/version"
|
|
14
14
|
require_relative "vore/configuration"
|
15
15
|
require_relative "vore/logger"
|
16
16
|
require_relative "vore/crawler"
|
17
|
+
require_relative "vore/page"
|
18
|
+
require_relative "vore/page_data"
|
19
|
+
require_relative "vore/website"
|
17
20
|
|
18
21
|
module Vore
|
19
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1.4
|
4
|
+
version: 0.2.0
|
5
5
|
platform: arm64-darwin
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
@@ -49,6 +49,7 @@ files:
|
|
49
49
|
- lib/vore/handlers/content_extractor.rb
|
50
50
|
- lib/vore/logger.rb
|
51
51
|
- lib/vore/page.rb
|
52
|
+
- lib/vore/page_data.rb
|
52
53
|
- lib/vore/version.rb
|
53
54
|
- lib/vore/website.rb
|
54
55
|
homepage: https://github.com/gjtorikian/vore
|
@@ -75,5 +76,5 @@ requirements: []
|
|
75
76
|
rubygems_version: 3.5.3
|
76
77
|
signing_key:
|
77
78
|
specification_version: 4
|
78
|
-
summary: Quickly
|
79
|
+
summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
|
79
80
|
test_files: []
|