vore 0.1.1.3-x86_64-linux → 0.2.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c3a066d2ed47c281d2b9ccf74c5138b36b4e6f85160004d563df4932f6308849
4
- data.tar.gz: 79753c3ff8f62347166b0e8c18d2c237b934226a48565e36b893d999e0fc0c13
3
+ metadata.gz: 7427980069e4d526d966f4e94f7f4f0e9f35831c2202d3ce6cdf93d7ed4db4cb
4
+ data.tar.gz: fd5b467cfa827e9c0b776b03982e5b2a683725b57be3d83c0e06e924634bfe7d
5
5
  SHA512:
6
- metadata.gz: 1805d6b37a00b6b566c1b86fe2ce3b63179f8a59418df1e02702de522a5ddc0ca408171b980931396eb0cccdb493a831e45527186f024ed7839111a44f5c4002
7
- data.tar.gz: 3b63fcb5ead63bda2fc83d4cef0a3a95f82bdf77d4c4835fca8d600c41022a84d17dc26502884d1c81ba743e4d8afdd505adf19ed2ab1b06233455847c08d3fe
6
+ metadata.gz: 1b252342c22884e71703595a880dc1d8c3f64daaaecbdf60744245ef8e741441ecb23de978f80531ed4a48d67283f343d2455992d88832ed555bf78c736232e7
7
+ data.tar.gz: 7b85dc417826abcac6d8d008e1b461aba77857d2fcf8a37953946e6a0b9f9014420270c3635c819e3a698df36a6ab58fba518135784f9b84887c9bf5ba8a2db6
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
4
4
 
5
- Vore gobbles up webpages and spits out their content.
5
+ Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
6
6
 
7
7
  ## Installation
8
8
 
data/exe/vore-spider CHANGED
Binary file
data/lib/vore/crawler.rb CHANGED
@@ -6,14 +6,17 @@ module Vore
6
6
  # This is the class that starts and controls the crawling
7
7
  class Crawler
8
8
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
9
+ FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
9
10
 
10
11
  # Creates a crawler
11
12
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
12
13
  def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
13
14
  @denylist_regexp = Regexp.union(denylist)
14
15
 
15
- @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [Vole::Handlers::ContentExtractor.new])
16
- @executable = File.expand_path(File.join("exe", "vore-spider"))
16
+ @content_extractor = Vole::Handlers::ContentExtractor.new
17
+ @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
18
+ ext = PLATFORM.include?("windows") ? ".exe" : ""
19
+ @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
17
20
  @output_dir = "tmp/vore"
18
21
 
19
22
  return if File.exist?(@executable)
@@ -38,10 +41,20 @@ module Vore
38
41
  Dir.glob("tmp/**/*").each do |path|
39
42
  next unless File.file?(path)
40
43
 
41
- html_file = File.read(path)
44
+ html_file = File.read(path).force_encoding("UTF-8")
42
45
  rewritten_html_file = @selma.rewrite(html_file)
43
46
 
44
- yield rewritten_html_file
47
+ # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
48
+ url_path = path.split(FILE_SEPERATOR)[3..].join("/")
49
+
50
+ page = Vore::PageData.new(
51
+ content: rewritten_html_file,
52
+ title: @content_extractor.title,
53
+ meta: @content_extractor.meta,
54
+ path: url_path,
55
+ )
56
+
57
+ yield page
45
58
  ensure
46
59
  File.delete(path) if File.file?(path)
47
60
  end
@@ -3,19 +3,42 @@
3
3
  module Vole
4
4
  module Handlers
5
5
  class ContentExtractor
6
- SELECTOR = Selma::Selector.new(match_element: "*")
6
+ SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
7
+
8
+ attr_reader :title, :meta
9
+
10
+ def initialize
11
+ super
12
+ @title = ""
13
+ @meta = {}
14
+ @within_title = false
15
+ end
7
16
 
8
17
  def selector
9
18
  SELECTOR
10
19
  end
11
20
 
12
21
  def handle_element(element)
13
- if element.tag_name == "pre" || element.tag_name == "code"
22
+ if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
23
+ element.remove
24
+ elsif element.tag_name == "title"
25
+ @within_title = true
14
26
  element.remove
27
+ elsif element.tag_name == "meta"
28
+ return if element.attributes["name"].nil?
29
+
30
+ @meta[element.attributes["name"]] = element.attributes["content"]
15
31
  else
16
32
  element.remove_and_keep_content
17
33
  end
18
34
  end
35
+
36
+ def handle_text_chunk(text)
37
+ if @within_title
38
+ @within_title = false
39
+ @title = text.to_s
40
+ end
41
+ end
19
42
  end
20
43
  end
21
44
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Vore
4
+ class PageData
5
+ attr_reader :title, :meta, :content, :path
6
+
7
+ def initialize(title:, meta:, content:, path:)
8
+ @title = title
9
+ @meta = meta
10
+ @content = content
11
+ @path = path
12
+ end
13
+ end
14
+ end
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.1.1.3"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/vore.rb CHANGED
@@ -14,6 +14,9 @@ require_relative "vore/version"
14
14
  require_relative "vore/configuration"
15
15
  require_relative "vore/logger"
16
16
  require_relative "vore/crawler"
17
+ require_relative "vore/page"
18
+ require_relative "vore/page_data"
19
+ require_relative "vore/website"
17
20
 
18
21
  module Vore
19
22
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1.3
4
+ version: 0.2.0
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - Garen J. Torikian
@@ -49,6 +49,7 @@ files:
49
49
  - lib/vore/handlers/content_extractor.rb
50
50
  - lib/vore/logger.rb
51
51
  - lib/vore/page.rb
52
+ - lib/vore/page_data.rb
52
53
  - lib/vore/version.rb
53
54
  - lib/vore/website.rb
54
55
  homepage: https://github.com/gjtorikian/vore
@@ -75,5 +76,5 @@ requirements: []
75
76
  rubygems_version: 3.5.3
76
77
  signing_key:
77
78
  specification_version: 4
78
- summary: Quickly consume websites and spit out text. Powered by Rust.
79
+ summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
79
80
  test_files: []