vore 0.1.1.3-x86_64-darwin → 0.2.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bafdb11e728ed5e2972d35e509b2b977773e5d96c01f9ab115ad6ce153a83474
4
- data.tar.gz: 9f53f326117c7c7281257d4138ce2458dbbd69e3ee9db4b3007ea976ac9e2091
3
+ metadata.gz: 5c54c9d9a3d685f0545ebcec67d0aa30a3a6751d5ae31093197a021d24178040
4
+ data.tar.gz: d961d2d7c2bbcf3fad014e4bf1eea9f5d7a692901dcea50f66c3bdccf64066f9
5
5
  SHA512:
6
- metadata.gz: e8c324ffd623dcbf83124d87f8e427b19d8e86b7e072ce2d2ace9c9406ab8006c738d03bc0fea507f7f91802075c2b66c8238a754472c24fe31254002e16b0a4
7
- data.tar.gz: b980f1990f71b2d9315d8dfee3f9385c781f06bdf173a691fcab96f53b8a070b75bf231b12d7504cc69a8ea59284f4249cc838add339d931be0075ef7874841a
6
+ metadata.gz: 1a9ae3ef5f6b227c86cb21ecef0377ab7582e96ca562505e83b127333c7cad7579c4e5b1525793cd54a4d97a42c626646fcfae4146b547e00373db339f4b9615
7
+ data.tar.gz: ec1173ad2572e8bf07760360941493045d9e1d39912651cab8403be4418da6d39df42e9bdcd3d2f8d61ddf576324fbb73de7d252ca301a75f83920e77a89de79
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
4
4
 
5
- Vore gobbles up webpages and spits out their content.
5
+ Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
6
6
 
7
7
  ## Installation
8
8
 
data/exe/vore-spider CHANGED
Binary file
data/lib/vore/crawler.rb CHANGED
@@ -6,14 +6,17 @@ module Vore
6
6
  # This is the class that starts and controls the crawling
7
7
  class Crawler
8
8
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
9
+ FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
9
10
 
10
11
  # Creates a crawler
11
12
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
12
13
  def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
13
14
  @denylist_regexp = Regexp.union(denylist)
14
15
 
15
- @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [Vole::Handlers::ContentExtractor.new])
16
- @executable = File.expand_path(File.join("exe", "vore-spider"))
16
+ @content_extractor = Vole::Handlers::ContentExtractor.new
17
+ @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
18
+ ext = PLATFORM.include?("windows") ? ".exe" : ""
19
+ @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
17
20
  @output_dir = "tmp/vore"
18
21
 
19
22
  return if File.exist?(@executable)
@@ -38,10 +41,20 @@ module Vore
38
41
  Dir.glob("tmp/**/*").each do |path|
39
42
  next unless File.file?(path)
40
43
 
41
- html_file = File.read(path)
44
+ html_file = File.read(path).force_encoding("UTF-8")
42
45
  rewritten_html_file = @selma.rewrite(html_file)
43
46
 
44
- yield rewritten_html_file
47
+ # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
48
+ url_path = path.split(FILE_SEPERATOR)[3..].join("/")
49
+
50
+ page = Vore::PageData.new(
51
+ content: rewritten_html_file,
52
+ title: @content_extractor.title,
53
+ meta: @content_extractor.meta,
54
+ path: url_path,
55
+ )
56
+
57
+ yield page
45
58
  ensure
46
59
  File.delete(path) if File.file?(path)
47
60
  end
@@ -3,19 +3,42 @@
3
3
  module Vole
4
4
  module Handlers
5
5
  class ContentExtractor
6
- SELECTOR = Selma::Selector.new(match_element: "*")
6
+ SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
7
+
8
+ attr_reader :title, :meta
9
+
10
+ def initialize
11
+ super
12
+ @title = ""
13
+ @meta = {}
14
+ @within_title = false
15
+ end
7
16
 
8
17
  def selector
9
18
  SELECTOR
10
19
  end
11
20
 
12
21
  def handle_element(element)
13
- if element.tag_name == "pre" || element.tag_name == "code"
22
+ if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
23
+ element.remove
24
+ elsif element.tag_name == "title"
25
+ @within_title = true
14
26
  element.remove
27
+ elsif element.tag_name == "meta"
28
+ return if element.attributes["name"].nil?
29
+
30
+ @meta[element.attributes["name"]] = element.attributes["content"]
15
31
  else
16
32
  element.remove_and_keep_content
17
33
  end
18
34
  end
35
+
36
+ def handle_text_chunk(text)
37
+ if @within_title
38
+ @within_title = false
39
+ @title = text.to_s
40
+ end
41
+ end
19
42
  end
20
43
  end
21
44
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Vore
4
+ class PageData
5
+ attr_reader :title, :meta, :content, :path
6
+
7
+ def initialize(title:, meta:, content:, path:)
8
+ @title = title
9
+ @meta = meta
10
+ @content = content
11
+ @path = path
12
+ end
13
+ end
14
+ end
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.1.1.3"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/vore.rb CHANGED
@@ -14,6 +14,9 @@ require_relative "vore/version"
14
14
  require_relative "vore/configuration"
15
15
  require_relative "vore/logger"
16
16
  require_relative "vore/crawler"
17
+ require_relative "vore/page"
18
+ require_relative "vore/page_data"
19
+ require_relative "vore/website"
17
20
 
18
21
  module Vore
19
22
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1.3
4
+ version: 0.2.0
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian
@@ -49,6 +49,7 @@ files:
49
49
  - lib/vore/handlers/content_extractor.rb
50
50
  - lib/vore/logger.rb
51
51
  - lib/vore/page.rb
52
+ - lib/vore/page_data.rb
52
53
  - lib/vore/version.rb
53
54
  - lib/vore/website.rb
54
55
  homepage: https://github.com/gjtorikian/vore
@@ -75,5 +76,5 @@ requirements: []
75
76
  rubygems_version: 3.5.3
76
77
  signing_key:
77
78
  specification_version: 4
78
- summary: Quickly consume websites and spit out text. Powered by Rust.
79
+ summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
79
80
  test_files: []