vore 0.1.1.4-x86_64-darwin → 0.2.0-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/exe/vore-spider +0 -0
- data/lib/vore/crawler.rb +17 -4
- data/lib/vore/handlers/content_extractor.rb +25 -2
- data/lib/vore/page_data.rb +14 -0
- data/lib/vore/version.rb +1 -1
- data/lib/vore.rb +3 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5c54c9d9a3d685f0545ebcec67d0aa30a3a6751d5ae31093197a021d24178040
|
4
|
+
data.tar.gz: d961d2d7c2bbcf3fad014e4bf1eea9f5d7a692901dcea50f66c3bdccf64066f9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a9ae3ef5f6b227c86cb21ecef0377ab7582e96ca562505e83b127333c7cad7579c4e5b1525793cd54a4d97a42c626646fcfae4146b547e00373db339f4b9615
|
7
|
+
data.tar.gz: ec1173ad2572e8bf07760360941493045d9e1d39912651cab8403be4418da6d39df42e9bdcd3d2f8d61ddf576324fbb73de7d252ca301a75f83920e77a89de79
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|

|
4
4
|
|
5
|
-
Vore
|
5
|
+
Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/crawler.rb
CHANGED
@@ -6,14 +6,17 @@ module Vore
|
|
6
6
|
# This is the class that starts and controls the crawling
|
7
7
|
class Crawler
|
8
8
|
PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
|
9
|
+
FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
|
9
10
|
|
10
11
|
# Creates a crawler
|
11
12
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
12
13
|
def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
|
13
14
|
@denylist_regexp = Regexp.union(denylist)
|
14
15
|
|
15
|
-
@
|
16
|
-
@
|
16
|
+
@content_extractor = Vole::Handlers::ContentExtractor.new
|
17
|
+
@selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
|
18
|
+
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
19
|
+
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
17
20
|
@output_dir = "tmp/vore"
|
18
21
|
|
19
22
|
return if File.exist?(@executable)
|
@@ -38,10 +41,20 @@ module Vore
|
|
38
41
|
Dir.glob("tmp/**/*").each do |path|
|
39
42
|
next unless File.file?(path)
|
40
43
|
|
41
|
-
html_file = File.read(path)
|
44
|
+
html_file = File.read(path).force_encoding("UTF-8")
|
42
45
|
rewritten_html_file = @selma.rewrite(html_file)
|
43
46
|
|
44
|
-
|
47
|
+
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
48
|
+
url_path = path.split(FILE_SEPERATOR)[3..].join("/")
|
49
|
+
|
50
|
+
page = Vore::PageData.new(
|
51
|
+
content: rewritten_html_file,
|
52
|
+
title: @content_extractor.title,
|
53
|
+
meta: @content_extractor.meta,
|
54
|
+
path: url_path,
|
55
|
+
)
|
56
|
+
|
57
|
+
yield page
|
45
58
|
ensure
|
46
59
|
File.delete(path) if File.file?(path)
|
47
60
|
end
|
data/lib/vore/handlers/content_extractor.rb
CHANGED
@@ -3,19 +3,42 @@
|
|
3
3
|
module Vole
|
4
4
|
module Handlers
|
5
5
|
class ContentExtractor
|
6
|
-
SELECTOR = Selma::Selector.new(match_element: "*")
|
6
|
+
SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
|
7
|
+
|
8
|
+
attr_reader :title, :meta
|
9
|
+
|
10
|
+
def initialize
|
11
|
+
super
|
12
|
+
@title = ""
|
13
|
+
@meta = {}
|
14
|
+
@within_title = false
|
15
|
+
end
|
7
16
|
|
8
17
|
def selector
|
9
18
|
SELECTOR
|
10
19
|
end
|
11
20
|
|
12
21
|
def handle_element(element)
|
13
|
-
if element.tag_name == "pre" || element.tag_name == "code"
|
22
|
+
if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
|
23
|
+
element.remove
|
24
|
+
elsif element.tag_name == "title"
|
25
|
+
@within_title = true
|
14
26
|
element.remove
|
27
|
+
elsif element.tag_name == "meta"
|
28
|
+
return if element.attributes["name"].nil?
|
29
|
+
|
30
|
+
@meta[element.attributes["name"]] = element.attributes["content"]
|
15
31
|
else
|
16
32
|
element.remove_and_keep_content
|
17
33
|
end
|
18
34
|
end
|
35
|
+
|
36
|
+
def handle_text_chunk(text)
|
37
|
+
if @within_title
|
38
|
+
@within_title = false
|
39
|
+
@title = text.to_s
|
40
|
+
end
|
41
|
+
end
|
19
42
|
end
|
20
43
|
end
|
21
44
|
end
|
data/lib/vore/version.rb
CHANGED
data/lib/vore.rb
CHANGED
@@ -14,6 +14,9 @@ require_relative "vore/version"
|
|
14
14
|
require_relative "vore/configuration"
|
15
15
|
require_relative "vore/logger"
|
16
16
|
require_relative "vore/crawler"
|
17
|
+
require_relative "vore/page"
|
18
|
+
require_relative "vore/page_data"
|
19
|
+
require_relative "vore/website"
|
17
20
|
|
18
21
|
module Vore
|
19
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Garen J. Torikian
|
@@ -49,6 +49,7 @@ files:
|
|
49
49
|
- lib/vore/handlers/content_extractor.rb
|
50
50
|
- lib/vore/logger.rb
|
51
51
|
- lib/vore/page.rb
|
52
|
+
- lib/vore/page_data.rb
|
52
53
|
- lib/vore/version.rb
|
53
54
|
- lib/vore/website.rb
|
54
55
|
homepage: https://github.com/gjtorikian/vore
|
@@ -75,5 +76,5 @@ requirements: []
|
|
75
76
|
rubygems_version: 3.5.3
|
76
77
|
signing_key:
|
77
78
|
specification_version: 4
|
78
|
-
summary: Quickly
|
79
|
+
summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
|
79
80
|
test_files: []
|