vore 0.1.1.4-x86_64-darwin → 0.2.0-x86_64-darwin

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 81fffac2120b7fe4284124ed91f01c7a9bc34324b60877b0f1d1b2789de9bd82
4
- data.tar.gz: 4c9e4f94494a8ad9bff6613661857f5862679099302b7b1f3725ef79e484e16d
3
+ metadata.gz: 5c54c9d9a3d685f0545ebcec67d0aa30a3a6751d5ae31093197a021d24178040
4
+ data.tar.gz: d961d2d7c2bbcf3fad014e4bf1eea9f5d7a692901dcea50f66c3bdccf64066f9
5
5
  SHA512:
6
- metadata.gz: 187fb799c3db652f85f70fd9676468b1f80efcd95dfd395cbc7b833951d605e55efbced8d1005d405836bae3ba36c87baa6565e761aa3f2b81fe9b9f447f9b07
7
- data.tar.gz: 2346f4bca694fc780284e3e727ece3b3932a53e95d4b437b2b2c370cbbb6a9ba61dc8959d4b6045107d0b4635fbc8fca59b75984363ae6e18ca3d79e6f4f19a4
6
+ metadata.gz: 1a9ae3ef5f6b227c86cb21ecef0377ab7582e96ca562505e83b127333c7cad7579c4e5b1525793cd54a4d97a42c626646fcfae4146b547e00373db339f4b9615
7
+ data.tar.gz: ec1173ad2572e8bf07760360941493045d9e1d39912651cab8403be4418da6d39df42e9bdcd3d2f8d61ddf576324fbb73de7d252ca301a75f83920e77a89de79
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
4
4
 
5
- Vore gobbles up webpages and spits out their content.
5
+ Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
6
6
 
7
7
  ## Installation
8
8
 
data/exe/vore-spider CHANGED
Binary file
data/lib/vore/crawler.rb CHANGED
@@ -6,14 +6,17 @@ module Vore
6
6
  # This is the class that starts and controls the crawling
7
7
  class Crawler
8
8
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
9
+ FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
9
10
 
10
11
  # Creates a crawler
11
12
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
12
13
  def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
13
14
  @denylist_regexp = Regexp.union(denylist)
14
15
 
15
- @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [Vole::Handlers::ContentExtractor.new])
16
- @executable = File.expand_path(File.join("exe", "vore-spider"))
16
+ @content_extractor = Vole::Handlers::ContentExtractor.new
17
+ @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
18
+ ext = PLATFORM.include?("windows") ? ".exe" : ""
19
+ @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
17
20
  @output_dir = "tmp/vore"
18
21
 
19
22
  return if File.exist?(@executable)
@@ -38,10 +41,20 @@ module Vore
38
41
  Dir.glob("tmp/**/*").each do |path|
39
42
  next unless File.file?(path)
40
43
 
41
- html_file = File.read(path)
44
+ html_file = File.read(path).force_encoding("UTF-8")
42
45
  rewritten_html_file = @selma.rewrite(html_file)
43
46
 
44
- yield rewritten_html_file
47
+ # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
48
+ url_path = path.split(FILE_SEPERATOR)[3..].join("/")
49
+
50
+ page = Vore::PageData.new(
51
+ content: rewritten_html_file,
52
+ title: @content_extractor.title,
53
+ meta: @content_extractor.meta,
54
+ path: url_path,
55
+ )
56
+
57
+ yield page
45
58
  ensure
46
59
  File.delete(path) if File.file?(path)
47
60
  end
@@ -3,19 +3,42 @@
3
3
  module Vole
4
4
  module Handlers
5
5
  class ContentExtractor
6
- SELECTOR = Selma::Selector.new(match_element: "*")
6
+ SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
7
+
8
+ attr_reader :title, :meta
9
+
10
+ def initialize
11
+ super
12
+ @title = ""
13
+ @meta = {}
14
+ @within_title = false
15
+ end
7
16
 
8
17
  def selector
9
18
  SELECTOR
10
19
  end
11
20
 
12
21
  def handle_element(element)
13
- if element.tag_name == "pre" || element.tag_name == "code"
22
+ if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
23
+ element.remove
24
+ elsif element.tag_name == "title"
25
+ @within_title = true
14
26
  element.remove
27
+ elsif element.tag_name == "meta"
28
+ return if element.attributes["name"].nil?
29
+
30
+ @meta[element.attributes["name"]] = element.attributes["content"]
15
31
  else
16
32
  element.remove_and_keep_content
17
33
  end
18
34
  end
35
+
36
+ def handle_text_chunk(text)
37
+ if @within_title
38
+ @within_title = false
39
+ @title = text.to_s
40
+ end
41
+ end
19
42
  end
20
43
  end
21
44
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Vore
4
+ class PageData
5
+ attr_reader :title, :meta, :content, :path
6
+
7
+ def initialize(title:, meta:, content:, path:)
8
+ @title = title
9
+ @meta = meta
10
+ @content = content
11
+ @path = path
12
+ end
13
+ end
14
+ end
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.1.1.4"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/vore.rb CHANGED
@@ -14,6 +14,9 @@ require_relative "vore/version"
14
14
  require_relative "vore/configuration"
15
15
  require_relative "vore/logger"
16
16
  require_relative "vore/crawler"
17
+ require_relative "vore/page"
18
+ require_relative "vore/page_data"
19
+ require_relative "vore/website"
17
20
 
18
21
  module Vore
19
22
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1.4
4
+ version: 0.2.0
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian
@@ -49,6 +49,7 @@ files:
49
49
  - lib/vore/handlers/content_extractor.rb
50
50
  - lib/vore/logger.rb
51
51
  - lib/vore/page.rb
52
+ - lib/vore/page_data.rb
52
53
  - lib/vore/version.rb
53
54
  - lib/vore/website.rb
54
55
  homepage: https://github.com/gjtorikian/vore
@@ -75,5 +76,5 @@ requirements: []
75
76
  rubygems_version: 3.5.3
76
77
  signing_key:
77
78
  specification_version: 4
78
- summary: Quickly consume websites and spit out text. Powered by Rust.
79
+ summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
79
80
  test_files: []