vore 0.2.0-arm64-linux

Sign up to get free protection for your applications and to get access to all the features.
data/Cargo.toml ADDED
@@ -0,0 +1,7 @@
1
+ # This Cargo.toml is here to let external tools (IDEs, etc.) know that this is
2
+ # a Rust project. Your extension's dependencies should be added to the Cargo.toml
3
+ # in the ext/ directory.
4
+
5
+ [workspace]
6
+ members = ["./ext/vore"]
7
+ resolver = "2"
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Garen J. Torikian
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # Vore
2
+
3
+ ![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
4
+
5
+ Vore quickly crawls websites and spits out text sans tags. It's written in Ruby and powered by Rust.
6
+
7
+ ## Installation
8
+
9
+
10
+ Install the gem and add to the application's Gemfile by executing:
11
+
12
+ $ bundle add vore
13
+
14
+ If bundler is not being used to manage dependencies, install the gem by executing:
15
+
16
+ $ gem install vore
17
+
18
+ ## Usage
19
+
20
+ ```ruby
21
+ crawler = Vore::Crawler.new
22
+ crawler.scrape_each_page("https://choosealicense.com") do |page|
23
+ puts page
24
+ end
25
+ ```
26
+
27
+ Each `page` is simply every text node. The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.
28
+
29
+ ## Development
30
+
31
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
32
+
33
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
34
+
35
+ ## Contributing
36
+
37
+ Bug reports and pull requests are welcome on GitHub at https://github.com/gjtorikian/vore.
38
+
39
+ ## License
40
+
41
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/exe/vore-spider ADDED
Binary file
@@ -0,0 +1,17 @@
1
+ [package]
2
+ name = "vore"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ authors = ["Garen J. Torikian <gjtorikian@users.noreply.github.com>"]
6
+ license = "MIT"
7
+ publish = false
8
+
9
+ [lib]
10
+ crate-type = ["cdylib"]
11
+
12
+ [dependencies]
13
+ magnus = { version = "0.7", features = ["rb-sys"] }
14
+ rb-sys = { version = "*", default-features = false, features = [
15
+ "stable-api-compiled-fallback",
16
+ ] }
17
+ spider_cli = { version = "1.99" }
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+ require "rb_sys/mkmf"
5
+
6
+ create_rust_makefile("vore/vore")
@@ -0,0 +1,13 @@
1
+ use magnus::{Error, Ruby};
2
+
3
+ pub mod page;
4
+ pub mod website;
5
+
6
+ #[magnus::init]
7
+ fn init(ruby: &Ruby) -> Result<(), Error> {
8
+ let m_vore = ruby.define_module("Vore")?;
9
+ website::init(m_vore).expect("cannot define Vore::Website class");
10
+ page::init(m_vore).expect("cannot define Vore::Page class");
11
+
12
+ Ok(())
13
+ }
@@ -0,0 +1,9 @@
1
+ use magnus::{prelude::*, RModule};
2
+
3
+ pub fn init(m_vore: RModule) -> Result<(), magnus::Error> {
4
+ let c_page = m_vore
5
+ .define_class("Page", magnus::class::object())
6
+ .expect("cannot define class Vore::Page");
7
+
8
+ Ok(())
9
+ }
@@ -0,0 +1,40 @@
1
+ use std::{borrow::BorrowMut, cell::RefCell};
2
+
3
+ use magnus::{function, prelude::*, scan_args, Error, RHash, RModule, Ruby, Value};
4
+
5
/// Plain data describing a site.
///
/// `Debug` is derived so values can be inspected in logs and test output.
#[derive(Clone, Debug)]
pub struct Website {
    /// The site string exactly as it was supplied by the caller.
    website: String,
}
9
+
10
+ #[derive(Clone)]
11
+ #[magnus::wrap(class = "Vore::Sanitizer")]
12
+ pub struct VoreWebsite(RefCell<Website>);
13
+
14
+ impl VoreWebsite {
15
+ pub fn new(arguments: &[Value]) -> Result<Self, magnus::Error> {
16
+ let args = scan_args::scan_args::<(String,), (), (), (), (), ()>(arguments)?;
17
+ let (website,): (String,) = args.required;
18
+
19
+ Ok(Self(RefCell::new(Website { website: website })))
20
+ }
21
+
22
+ // pub async fn scrape(&self) -> Result<(), magnus::Error> {
23
+ // let mut binding = self.0.borrow_mut();
24
+
25
+ // binding.website.scrape().await;
26
+
27
+ // Ok(())
28
+ // }
29
+ }
30
+
31
+ pub fn init(m_vore: RModule) -> Result<(), magnus::Error> {
32
+ let c_website = m_vore
33
+ .define_class("Website", magnus::class::object())
34
+ .expect("cannot define class Vore::Website");
35
+
36
+ c_website.define_singleton_method("new", function!(VoreWebsite::new, -1))?;
37
+ // c_website.define_method("scrape!", function!(VoreWebsite::scrape, 1))?;
38
+
39
+ Ok(())
40
+ }
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
module Vole
  # Namespace for settings shared by the crawler.
  class Configuration
    # A copy of Selma's RELAXED sanitizer settings with `allow_doctype`
    # overridden to false.
    DEFAULT_SANITIZATION_CONFIG =
      Selma::Sanitizer::Config::RELAXED.dup.merge(allow_doctype: false)
  end
end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "handlers/content_extractor"
4
+
5
module Vore
  # This is the class that starts and controls the crawling
  class Crawler
    # "<cpu>-<os>" string for the machine running this code.
    PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
    # The (misspelled) name is kept so existing references keep resolving.
    FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR

    # Creates a crawler
    # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
    #   It is compiled into @denylist_regexp here; nothing else in this class reads it yet.
    # sanitization_config: options handed to Selma::Sanitizer.new.
    #
    # Prints an error and exits the process with status 1 when the bundled
    # `vore-spider` executable cannot be found.
    def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
      @denylist_regexp = Regexp.union(denylist)

      @content_extractor = Vole::Handlers::ContentExtractor.new
      @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
      ext = PLATFORM.include?("windows") ? ".exe" : ""
      @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
      @output_dir = "tmp/vore"

      return if File.exist?(@executable)

      warn("ERROR: Unsupported platform: `#{PLATFORM}`")
      exit(1)
    end

    # Crawls `website` with the bundled executable, then yields one
    # Vore::PageData per downloaded file. Each file is deleted as soon as the
    # block has finished with it, even if the block raises.
    #
    # website: the URL to crawl, as a String.
    # Returns the list of paths that were visited.
    def scrape_each_page(website, &block)
      output_dir = "#{@output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
      Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")

      # The array form of IO.popen runs the program directly, without a shell,
      # so characters in `website` can never be interpreted as shell syntax.
      command = [
        @executable,
        "--user-agent", unquoted_user_agent,
        "--url", website,
        "download",
        "-t",
        output_dir,
      ]
      output = IO.popen(command, &:read)

      Vore.logger.info("Vore finished crawling #{website}: #{output}")

      # Only look at this crawl's own directory: every visited file is deleted
      # below, so a wider glob would destroy unrelated files under tmp/.
      Dir.glob(File.join(output_dir, "**", "*")).each do |path|
        next unless File.file?(path)

        html_file = File.read(path).force_encoding("UTF-8")
        rewritten_html_file = @selma.rewrite(html_file)

        # drops the leading output directory ("tmp", "vore", and the site name)
        url_path = path.delete_prefix("#{output_dir}/")

        page = Vore::PageData.new(
          content: rewritten_html_file,
          title: @content_extractor.title,
          # The extractor keeps a single Hash for its whole lifetime; hand each
          # page its own copy so later pages cannot change earlier results.
          meta: @content_extractor.meta.dup,
          path: url_path,
        )

        yield page
      ensure
        File.delete(path) if File.file?(path)
      end
    end

    # def crawl(site, block)
    #   Vore.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
    #   crawl_site(site)
    # end

    # The User-Agent header value wrapped in single quotes (the form that is
    # safe to paste into a shell command line).
    def user_agent
      "'#{unquoted_user_agent}'"
    end

    private

    # The bare User-Agent header value, for passing as a direct argument.
    def unquoted_user_agent
      "Mozilla/5.0 (compatible; Vore/#{Vore::VERSION}; +https://github.com/gjtorikian/vore)"
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
module Vole
  module Handlers
    # Selma handler that strips markup while recording the page title and
    # any named <meta> tags it encounters.
    class ContentExtractor
      # Matches every element, and text found inside <title>.
      SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")

      attr_reader :title, :meta

      def initialize
        super
        @title = ""
        @meta = {}
        @within_title = false
      end

      # The selector this handler wants to be called for.
      def selector
        SELECTOR
      end

      # Decides what happens to each matched element:
      # pre/code/script/form are dropped entirely, <title> is dropped after
      # flagging that its text should be captured, named <meta> tags are
      # recorded, and everything else loses its tag but keeps its content.
      def handle_element(element)
        case element.tag_name
        when "pre", "code", "script", "form"
          element.remove
        when "title"
          @within_title = true
          element.remove
        when "meta"
          name = element.attributes["name"]
          @meta[name] = element.attributes["content"] unless name.nil?
        else
          element.remove_and_keep_content
        end
      end

      # Stores the first text chunk seen after a <title> element as the title.
      def handle_text_chunk(text)
        return unless @within_title

        @within_title = false
        @title = text.to_s
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "logger"
4
+ require "forwardable"
5
+
6
module Vore
  class << self
    # The logging facade used by the gem; assigned at the end of this file.
    attr_accessor :logger
  end

  # Class-level front for a lazily created ::Logger that writes to $stdout.
  class Logger
    class << self
      extend Forwardable

      # Forward the usual severity methods to the underlying logger.
      def_delegators :instance, :debug, :info, :warn, :error

      attr_writer :instance

      # Returns the current logger, building a DEBUG-level $stdout logger
      # (and switching $stdout to sync mode) the first time it is needed.
      def instance
        return @instance if @instance

        $stdout.sync = true
        @instance = ::Logger.new($stdout).tap { |log| log.level = ::Logger::DEBUG }
      end
    end
  end
end

Vore.logger = Vore::Logger
data/lib/vore/page.rb ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Vore
  # Empty class body; no Ruby-side behavior is defined here.
  class Page; end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Vore
  # Read-only value holder for one page: its title, meta, content and path.
  class PageData
    attr_reader :title, :meta, :content, :path

    # All four keywords are required; each is stored as given.
    def initialize(title:, meta:, content:, path:)
      @title, @meta, @content, @path = title, meta, content, path
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Vore
  # Version string of the gem.
  VERSION = "0.2.0"
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Vore
  # Empty class body; no Ruby-side behavior is defined here.
  class Website; end
end
data/lib/vore.rb ADDED
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
# Optional debugging aids: loaded only when the DEBUG environment variable is
# set (to any value), and skipped silently if the gems are not installed.
if ENV.key?("DEBUG")
  begin
    require "debug"
    require "amazing_print"
  rescue LoadError # rubocop:disable Lint/SuppressedException
  end
end
10
+
11
+ require "selma"
12
+
13
+ require_relative "vore/version"
14
+ require_relative "vore/configuration"
15
+ require_relative "vore/logger"
16
+ require_relative "vore/crawler"
17
+ require_relative "vore/page"
18
+ require_relative "vore/page_data"
19
+ require_relative "vore/website"
20
+
21
+ module Vore
22
+ end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vore
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: arm64-linux
6
+ authors:
7
+ - Garen J. Torikian
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-07-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: selma
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.4'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.4'
27
+ force_ruby_platform: false
28
+ description:
29
+ email:
30
+ - gjtorikian@users.noreply.github.com
31
+ executables: []
32
+ extensions:
33
+ - ext/vore/Cargo.toml
34
+ extra_rdoc_files: []
35
+ files:
36
+ - Cargo.lock
37
+ - Cargo.toml
38
+ - LICENSE.txt
39
+ - README.md
40
+ - exe/vore-spider
41
+ - ext/vore/Cargo.toml
42
+ - ext/vore/extconf.rb
43
+ - ext/vore/src/lib.rs
44
+ - ext/vore/src/page.rs
45
+ - ext/vore/src/website.rs
46
+ - lib/vore.rb
47
+ - lib/vore/configuration.rb
48
+ - lib/vore/crawler.rb
49
+ - lib/vore/handlers/content_extractor.rb
50
+ - lib/vore/logger.rb
51
+ - lib/vore/page.rb
52
+ - lib/vore/page_data.rb
53
+ - lib/vore/version.rb
54
+ - lib/vore/website.rb
55
+ homepage: https://github.com/gjtorikian/vore
56
+ licenses:
57
+ - MIT
58
+ metadata:
59
+ homepage_uri: https://github.com/gjtorikian/vore
60
+ source_code_uri: https://github.com/gjtorikian/vore
61
+ post_install_message:
62
+ rdoc_options: []
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '3.1'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 3.3.22
75
+ requirements: []
76
+ rubygems_version: 3.5.3
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Quickly crawls websites and spits out text sans tags. Powered by Rust.
80
+ test_files: []