vore 0.1.1.4-x86_64-darwin → 0.2.1-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 81fffac2120b7fe4284124ed91f01c7a9bc34324b60877b0f1d1b2789de9bd82
4
- data.tar.gz: 4c9e4f94494a8ad9bff6613661857f5862679099302b7b1f3725ef79e484e16d
3
+ metadata.gz: 31497cb9cec566256acca79e43bfb5b6fe8791bea446bba60ca895f2be3ab91b
4
+ data.tar.gz: 2252352ba0f823215117c143d0608431bfd6e0037f9b122f6711c191dc7f99c2
5
5
  SHA512:
6
- metadata.gz: 187fb799c3db652f85f70fd9676468b1f80efcd95dfd395cbc7b833951d605e55efbced8d1005d405836bae3ba36c87baa6565e761aa3f2b81fe9b9f447f9b07
7
- data.tar.gz: 2346f4bca694fc780284e3e727ece3b3932a53e95d4b437b2b2c370cbbb6a9ba61dc8959d4b6045107d0b4635fbc8fca59b75984363ae6e18ca3d79e6f4f19a4
6
+ metadata.gz: a0f51ccfc30cb95ff87b6275c033b343ca7c07994f9937fad18a670d8973dd2c0d2db8c40f5849d2e32a372c1e755e97b3489b46532bd9c0317e7bfdc88786e0
7
+ data.tar.gz: 9517a87119025f30bdb105fa89440990ec260b32db10e497a6775271fec547358a4552edf6033508d27225a12427bd61272b13e0585824802c4490d7187173de
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
4
4
 
5
- Vore gobbles up webpages and spits out their content.
5
+ Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
6
6
 
7
7
  ## Installation
8
8
 
data/exe/vore-spider CHANGED
Binary file
data/lib/vore/crawler.rb CHANGED
@@ -6,15 +6,18 @@ module Vore
6
6
  # This is the class that starts and controls the crawling
7
7
  class Crawler
8
8
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
9
+ FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
9
10
 
10
11
  # Creates a crawler
11
12
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
12
13
  def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
13
14
  @denylist_regexp = Regexp.union(denylist)
14
15
 
15
- @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [Vole::Handlers::ContentExtractor.new])
16
- @executable = File.expand_path(File.join("exe", "vore-spider"))
17
- @output_dir = "tmp/vore"
16
+ @content_extractor = Vole::Handlers::ContentExtractor.new
17
+ @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
18
+ ext = PLATFORM.include?("windows") ? ".exe" : ""
19
+ @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
20
+ @parent_output_dir = "tmp/vore"
18
21
 
19
22
  return if File.exist?(@executable)
20
23
 
@@ -23,7 +26,7 @@ module Vore
23
26
  end
24
27
 
25
28
  def scrape_each_page(website, &block)
26
- output_dir = "#{@output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
29
+ output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
27
30
  Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
28
31
 
29
32
  output = %x(#{@executable} \
@@ -35,13 +38,23 @@ module Vore
35
38
 
36
39
  Vore.logger.info("Vore finished crawling #{website}: #{output}")
37
40
 
38
- Dir.glob("tmp/**/*").each do |path|
41
+ Dir.glob(File.join(output_dir, "**", "*")).each do |path|
39
42
  next unless File.file?(path)
40
43
 
41
- html_file = File.read(path)
44
+ html_file = File.read(path).force_encoding("UTF-8")
42
45
  rewritten_html_file = @selma.rewrite(html_file)
43
46
 
44
- yield rewritten_html_file
47
+ # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
48
+ url_path = path.split(FILE_SEPERATOR)[3..].join("/")
49
+
50
+ page = Vore::PageData.new(
51
+ content: rewritten_html_file,
52
+ title: @content_extractor.title,
53
+ meta: @content_extractor.meta,
54
+ path: url_path,
55
+ )
56
+
57
+ yield page
45
58
  ensure
46
59
  File.delete(path) if File.file?(path)
47
60
  end
@@ -3,19 +3,42 @@
3
3
  module Vole
4
4
  module Handlers
5
5
  class ContentExtractor
6
- SELECTOR = Selma::Selector.new(match_element: "*")
6
+ SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
7
+
8
+ attr_reader :title, :meta
9
+
10
+ def initialize
11
+ super
12
+ @title = ""
13
+ @meta = {}
14
+ @within_title = false
15
+ end
7
16
 
8
17
  def selector
9
18
  SELECTOR
10
19
  end
11
20
 
12
21
  def handle_element(element)
13
- if element.tag_name == "pre" || element.tag_name == "code"
22
+ if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
23
+ element.remove
24
+ elsif element.tag_name == "title"
25
+ @within_title = true
14
26
  element.remove
27
+ elsif element.tag_name == "meta"
28
+ return if element.attributes["name"].nil?
29
+
30
+ @meta[element.attributes["name"]] = element.attributes["content"]
15
31
  else
16
32
  element.remove_and_keep_content
17
33
  end
18
34
  end
35
+
36
+ def handle_text_chunk(text)
37
+ if @within_title
38
+ @within_title = false
39
+ @title = text.to_s
40
+ end
41
+ end
19
42
  end
20
43
  end
21
44
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Vore
4
+ class PageData
5
+ attr_reader :title, :meta, :content, :path
6
+
7
+ def initialize(title:, meta:, content:, path:)
8
+ @title = title
9
+ @meta = meta
10
+ @content = content
11
+ @path = path
12
+ end
13
+ end
14
+ end
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.1.1.4"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/vore.rb CHANGED
@@ -14,6 +14,9 @@ require_relative "vore/version"
14
14
  require_relative "vore/configuration"
15
15
  require_relative "vore/logger"
16
16
  require_relative "vore/crawler"
17
+ require_relative "vore/page"
18
+ require_relative "vore/page_data"
19
+ require_relative "vore/website"
17
20
 
18
21
  module Vore
19
22
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1.4
4
+ version: 0.2.1
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian
@@ -49,6 +49,7 @@ files:
49
49
  - lib/vore/handlers/content_extractor.rb
50
50
  - lib/vore/logger.rb
51
51
  - lib/vore/page.rb
52
+ - lib/vore/page_data.rb
52
53
  - lib/vore/version.rb
53
54
  - lib/vore/website.rb
54
55
  homepage: https://github.com/gjtorikian/vore
@@ -75,5 +76,5 @@ requirements: []
75
76
  rubygems_version: 3.5.3
76
77
  signing_key:
77
78
  specification_version: 4
78
- summary: Quickly consume websites and spit out text. Powered by Rust.
79
+ summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
79
80
  test_files: []