vore 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Cargo.toml ADDED
@@ -0,0 +1,7 @@
1
+ # This Cargo.toml is here to let external tools (IDEs, etc.) know that this is
2
+ # a Rust project. Your extension's dependencies should be added to the Cargo.toml
3
+ # in the ext/ directory.
4
+
5
+ [workspace]
6
+ members = ["./ext/vore"]
7
+ resolver = "2"
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Garen J. Torikian
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # Vore
2
+
3
+ ![Vore, by LewdBacon](https://github.com/user-attachments/assets/0923cc84-4cca-4d95-8a0e-4dad650525d2)
4
+
5
+ Vore gobbles up webpages and spits out their content.
6
+
7
+ ## Installation
8
+
9
+
10
+ Install the gem and add to the application's Gemfile by executing:
11
+
12
+ $ bundle add vore
13
+
14
+ If bundler is not being used to manage dependencies, install the gem by executing:
15
+
16
+ $ gem install vore
17
+
18
+ ## Usage
19
+
20
+ ```ruby
21
+ crawler = Vore::Crawler.new
22
+ crawler.scrape_each_page("https://choosealicense.com") do |page|
23
+ puts page
24
+ end
25
+ ```
26
+
27
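+ Each `page` is the text content of a crawled page: every text node, with the surrounding markup stripped (the contents of `<pre>` and `<code>` elements are dropped). The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.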
28
+
29
+ ## Development
30
+
31
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
32
+
33
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
34
+
35
+ ## Contributing
36
+
37
+ Bug reports and pull requests are welcome on GitHub at https://github.com/gjtorikian/vore.
38
+
39
+ ## License
40
+
41
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,17 @@
1
+ [package]
2
+ name = "vore"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ authors = ["Garen J. Torikian <gjtorikian@users.noreply.github.com>"]
6
+ license = "MIT"
7
+ publish = false
8
+
9
+ [lib]
10
+ crate-type = ["cdylib"]
11
+
12
+ [dependencies]
13
+ magnus = { version = "0.7", features = ["rb-sys"] }
14
+ rb-sys = { version = "*", default-features = false, features = [
15
+ "stable-api-compiled-fallback",
16
+ ] }
17
+ spider_cli = { version = "1.99" }
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mkmf"
4
+ require "rb_sys/mkmf"
5
+
6
+ create_rust_makefile("vore/vore")
@@ -0,0 +1,13 @@
1
+ use magnus::{Error, Ruby};
2
+
3
+ pub mod page;
4
+ pub mod website;
5
+
6
+ #[magnus::init]
7
+ fn init(ruby: &Ruby) -> Result<(), Error> {
8
+ let m_vore = ruby.define_module("Vore")?;
9
+ website::init(m_vore).expect("cannot define Vore::Website class");
10
+ page::init(m_vore).expect("cannot define Vore::Page class");
11
+
12
+ Ok(())
13
+ }
@@ -0,0 +1,9 @@
1
+ use magnus::{prelude::*, RModule};
2
+
3
+ pub fn init(m_vore: RModule) -> Result<(), magnus::Error> {
4
+ let c_page = m_vore
5
+ .define_class("Page", magnus::class::object())
6
+ .expect("cannot define class Vore::Page");
7
+
8
+ Ok(())
9
+ }
@@ -0,0 +1,40 @@
1
+ use std::{borrow::BorrowMut, cell::RefCell};
2
+
3
+ use magnus::{function, prelude::*, scan_args, Error, RHash, RModule, Ruby, Value};
4
+
5
+ #[derive(Clone)]
6
+ pub struct Website {
7
+ website: String,
8
+ }
9
+
10
+ #[derive(Clone)]
11
+ #[magnus::wrap(class = "Vore::Sanitizer")]
12
+ pub struct VoreWebsite(RefCell<Website>);
13
+
14
+ impl VoreWebsite {
15
+ pub fn new(arguments: &[Value]) -> Result<Self, magnus::Error> {
16
+ let args = scan_args::scan_args::<(String,), (), (), (), (), ()>(arguments)?;
17
+ let (website,): (String,) = args.required;
18
+
19
+ Ok(Self(RefCell::new(Website { website: website })))
20
+ }
21
+
22
+ // pub async fn scrape(&self) -> Result<(), magnus::Error> {
23
+ // let mut binding = self.0.borrow_mut();
24
+
25
+ // binding.website.scrape().await;
26
+
27
+ // Ok(())
28
+ // }
29
+ }
30
+
31
+ pub fn init(m_vore: RModule) -> Result<(), magnus::Error> {
32
+ let c_website = m_vore
33
+ .define_class("Website", magnus::class::object())
34
+ .expect("cannot define class Vore::Website");
35
+
36
+ c_website.define_singleton_method("new", function!(VoreWebsite::new, -1))?;
37
+ // c_website.define_method("scrape!", function!(VoreWebsite::scrape, 1))?;
38
+
39
+ Ok(())
40
+ }
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
module Vole
  # Default settings shared by the crawler.
  class Configuration
    # Selma's RELAXED sanitizer configuration with `allow_doctype` overridden
    # to false. `merge` returns a new Hash, so RELAXED itself is left untouched.
    DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.merge(allow_doctype: false)
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "handlers/content_extractor"
4
+
5
module Vore
  # This is the class that starts and controls the crawling
  class Crawler
    PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")

    # Creates a crawler
    # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
    # sanitization_config: Config passed to Selma::Sanitizer for the HTML rewriter.
    #
    # Prints an error and exits the process when the `vore-spider` executable
    # cannot be found.
    def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
      @denylist_regexp = Regexp.union(denylist)

      @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [Vole::Handlers::ContentExtractor.new])
      # Resolve the executable relative to this file (lib/vore/crawler.rb)
      # rather than the current working directory, so the lookup does not
      # depend on where the host application happens to be started from.
      @executable = File.expand_path(File.join("..", "..", "exe", "vore-spider"), __dir__)
      @output_dir = "tmp/vore"

      return if File.exist?(@executable)

      warn("ERROR: Unsupported platform: `#{PLATFORM}`")
      exit(1)
    end

    # Crawls `website`, then yields the rewritten content of every downloaded
    # file to the given block. Each file is deleted once it has been handled.
    #
    # website: the URL to crawl, passed to the spider's `--url` option.
    def scrape_each_page(website, &block)
      output_dir = "#{@output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
      Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")

      # Pass the command as an argv array so no shell is involved: `website`
      # is caller-supplied and must never be interpreted as shell syntax.
      command = [
        @executable,
        "--user-agent", unquoted_user_agent,
        "--url", website,
        "download",
        "-t", output_dir,
      ]
      output = IO.popen(command, &:read)

      Vore.logger.info("Vore finished crawling #{website}: #{output}")

      # Only walk this crawl's own output directory; globbing all of tmp/
      # would read and delete files that do not belong to this crawl.
      Dir.glob(File.join(output_dir, "**", "*")).each do |path|
        next unless File.file?(path)

        html_file = File.read(path)
        rewritten_html_file = @selma.rewrite(html_file)

        yield rewritten_html_file
      ensure
        File.delete(path) if File.file?(path)
      end
    end

    # def crawl(site, block)
    #   Vore.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
    #   crawl_site(site)
    # end

    # The User-Agent string wrapped in single quotes (shell-quoted form).
    def user_agent
      "'#{unquoted_user_agent}'"
    end

    private

    # The bare User-Agent value, suitable for passing as a single argv entry.
    def unquoted_user_agent
      "Mozilla/5.0 (compatible; Vore/#{Vore::VERSION}; +https://github.com/gjtorikian/vore)"
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module Vole
  module Handlers
    # Selma handler that matches every element: `pre` and `code` elements are
    # removed outright, and every other element is removed while its content
    # is kept.
    class ContentExtractor
      SELECTOR = Selma::Selector.new(match_element: "*")

      # The selector this handler applies to (all elements).
      def selector
        SELECTOR
      end

      # Called with each matched element.
      def handle_element(element)
        case element.tag_name
        when "pre", "code" then element.remove
        else element.remove_and_keep_content
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "logger"
4
+ require "forwardable"
5
+
6
module Vore
  class << self
    # The object that receives Vore's log calls.
    attr_accessor :logger
  end

  # Class-level facade over a lazily created standard-library ::Logger.
  class Logger
    class << self
      extend Forwardable

      # Allows the underlying logger to be swapped out.
      attr_writer :instance

      def_delegators :instance, :debug, :info, :warn, :error

      # The underlying logger; on first use, one writing to $stdout at DEBUG
      # level is created and memoized.
      def instance
        @instance ||= begin
          $stdout.sync = true
          ::Logger.new($stdout).tap { |log| log.level = ::Logger::DEBUG }
        end
      end
    end
  end
end

Vore.logger = Vore::Logger
data/lib/vore/page.rb ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Vore
  # Empty Ruby-side definition of Vore::Page; it declares no methods of its own.
  class Page; end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Vore
4
+ VERSION = "0.1.1"
5
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Vore
  # Empty Ruby-side definition of Vore::Website; it declares no methods of its own.
  class Website; end
end
data/lib/vore.rb ADDED
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ if ENV.fetch("DEBUG", false)
4
+ begin
5
+ require "debug"
6
+ require "amazing_print"
7
+ rescue LoadError # rubocop:disable Lint/SuppressedException
8
+ end
9
+ end
10
+
11
+ require "selma"
12
+
13
+ require_relative "vore/version"
14
+ require_relative "vore/configuration"
15
+ require_relative "vore/logger"
16
+ require_relative "vore/crawler"
17
+
18
+ module Vore
19
+ end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vore
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Garen J. Torikian
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-07-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: selma
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.4'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.4'
27
+ force_ruby_platform: false
28
+ description:
29
+ email:
30
+ - gjtorikian@users.noreply.github.com
31
+ executables: []
32
+ extensions:
33
+ - ext/vore/Cargo.toml
34
+ extra_rdoc_files: []
35
+ files:
36
+ - Cargo.lock
37
+ - Cargo.toml
38
+ - LICENSE.txt
39
+ - README.md
40
+ - ext/vore/Cargo.toml
41
+ - ext/vore/extconf.rb
42
+ - ext/vore/src/lib.rs
43
+ - ext/vore/src/page.rs
44
+ - ext/vore/src/website.rs
45
+ - lib/vore.rb
46
+ - lib/vore/configuration.rb
47
+ - lib/vore/crawler.rb
48
+ - lib/vore/handlers/content_extractor.rb
49
+ - lib/vore/logger.rb
50
+ - lib/vore/page.rb
51
+ - lib/vore/version.rb
52
+ - lib/vore/website.rb
53
+ homepage: https://github.com/gjtorikian/vore
54
+ licenses:
55
+ - MIT
56
+ metadata:
57
+ homepage_uri: https://github.com/gjtorikian/vore
58
+ source_code_uri: https://github.com/gjtorikian/vore
59
+ post_install_message:
60
+ rdoc_options: []
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.1'
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 3.3.22
73
+ requirements: []
74
+ rubygems_version: 3.5.3
75
+ signing_key:
76
+ specification_version: 4
77
+ summary: Quickly consume websites and spit out text. Powered by Rust.
78
+ test_files: []