RubyGems - spiderman - Versions diffs - 2.0.0 - Mend

spiderman 2.0.0

Files changed (20) hide show

checksums.yaml +7 -0
data/.gitignore +12 -0
data/.rspec +3 -0
data/.travis.yml +6 -0
data/Gemfile +7 -0
data/Gemfile.lock +85 -0
data/LICENSE.txt +21 -0
data/README.md +86 -0
data/Rakefile +6 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/http/acts_as_nokogiri_document.rb +13 -0
data/lib/http/mime_type/html.rb +18 -0
data/lib/spiderman.rb +121 -0
data/lib/spiderman/railtie.rb +11 -0
data/lib/spiderman/runner.rb +47 -0
data/lib/spiderman/tasks.rake +36 -0
data/lib/spiderman/version.rb +3 -0
data/spiderman.gemspec +33 -0
metadata +107 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: ba6dbe7099be528ca38656fe766a6a836fb4e15205dc5bda3c7564580ca6bd26
+  data.tar.gz: d6b49ceaf38b77d19066f8eb9a17e2c4a081fda6b98a53e10d19af82343ddb98
+SHA512:
+  metadata.gz: 9aa9ebbbb420617cef05a71a1824a128e15eaca34aba94aa2e1972d9a9d1f972fbf517e24bec6cc5beea0f9acf859c844771ed55f9130c393fec834e1a13168d
+  data.tar.gz: 528743c4fb28e391749b954d5ad8698b7fdac8aaff8438ced572b491e9986126bb15853aa6cb5eeb2af1e77f6b1b19587de58298abc99480aae80f24c9046117

data/.gitignore ADDED

@@ -0,0 +1,12 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+# rspec failure tracking
+.rspec_status
+vendor/gems

data/.rspec ADDED

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.travis.yml ADDED

@@ -0,0 +1,6 @@
+---
+language: ruby
+cache: bundler
+rvm:
+  - 2.6.5
+before_install: gem install bundler -v 2.1.4

data/Gemfile ADDED

@@ -0,0 +1,7 @@
+source "https://rubygems.org"
+# Pull in the dependencies declared in spiderman.gemspec.
+gemspec
+# Tooling used for development and for running the test suite.
+gem "rake", "~> 12.0"
+gem "rspec", "~> 3.0"
+gem "webmock"

data/Gemfile.lock ADDED

@@ -0,0 +1,85 @@
+PATH
+  remote: .
+  specs:
+    spiderman (2.0.0)
+      activesupport (>= 5.0)
+      http (~> 4.0)
+      nokogiri (~> 1.10)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    activesupport (6.0.2.2)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (>= 0.7, < 2)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+      zeitwerk (~> 2.2)
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    concurrent-ruby (1.1.6)
+    crack (0.4.3)
+      safe_yaml (~> 1.0.0)
+    diff-lcs (1.3)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    ffi (1.12.2)
+    ffi-compiler (1.0.1)
+      ffi (>= 1.0.0)
+      rake
+    hashdiff (1.0.1)
+    http (4.3.0)
+      addressable (~> 2.3)
+      http-cookie (~> 1.0)
+      http-form_data (~> 2.2)
+      http-parser (~> 1.2.0)
+    http-cookie (1.0.3)
+      domain_name (~> 0.5)
+    http-form_data (2.3.0)
+    http-parser (1.2.1)
+      ffi-compiler (>= 1.0, < 2.0)
+    i18n (1.8.2)
+      concurrent-ruby (~> 1.0)
+    mini_portile2 (2.4.0)
+    minitest (5.14.0)
+    nokogiri (1.10.9)
+      mini_portile2 (~> 2.4.0)
+    public_suffix (4.0.3)
+    rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.1)
+      rspec-support (~> 3.9.1)
+    rspec-expectations (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.2)
+    safe_yaml (1.0.5)
+    thread_safe (0.3.6)
+    tzinfo (1.2.6)
+      thread_safe (~> 0.1)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.6)
+    webmock (3.8.3)
+      addressable (>= 2.3.6)
+      crack (>= 0.3.2)
+      hashdiff (>= 0.4.0, < 2.0.0)
+    zeitwerk (2.3.0)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  rake (~> 12.0)
+  rspec (~> 3.0)
+  spiderman!
+  webmock
+BUNDLED WITH
+   2.1.4

data/LICENSE.txt ADDED

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2020 Brandon Keepers
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,86 @@
+<div align="center">
+  <h1><img width="300" height="300" src="https://user-images.githubusercontent.com/173/77249168-99488080-6c15-11ea-98de-3d14a412265d.png" alt="Spiderman"></h1>
+  <h2>your friendly neighborhood web crawler</h2>
+</div>
+Spiderman is a Ruby gem for crawling and processing web pages.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'spiderman'
+```
+And then execute:
+    $ bundle install
+Or install it yourself as:
+    $ gem install spiderman
+## Usage
+```ruby
+class HackerNewsCrawler
+ include Spiderman
+ crawl "https://news.ycombinator.com/" do |response|
+   response.css('a.storylink').each do |a|
+     process! a["href"], :story
+   end
+ end
+ process :story do |response|
+   logger.info "#{response.uri} #{response.css('title').text}"
+   save_page(response)
+ end
+ def save_page(page)
+   # logic here for saving the page
+ end
+end
+```
+Run the crawler:
+```ruby
+HackerNewsCrawler.crawl!
+```
+### ActiveJob
+Spiderman works with [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html) out of the box. If your crawler class inherits from `ActiveJob::Base`, then requests will be made in your background worker. Each request will run as a separate job.
+```ruby
+class MyCrawler < ActiveJob::Base
+  include Spiderman
+  queue_as :crawler
+  crawl "https://example.com" do |response|
+    response.css('a').each {|a| process! a["href"], :link }
+  end
+  process :link do |response|
+    logger.info "Processing #{response.uri}"
+  end
+end
+```
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/bkeepers/spiderman.
+## License
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).

data/Rakefile ADDED

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/console ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "spiderman"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start(__FILE__)

data/bin/setup ADDED

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/lib/http/acts_as_nokogiri_document.rb ADDED

@@ -0,0 +1,13 @@
+module HTTP
+  # Module to mix into `HTTP::Response` to make it act like a Nokogiri doc
+  module ActAsNokogiriDocument
+    def document
+      return @document if defined?(@document)
+      @document = parse(content_type.mime_type)
+    end
+    def method_missing(method, *args, &block)
+      document.send(method, *args, &block)
+    end
+  end
+end

data/lib/http/mime_type/html.rb ADDED

@@ -0,0 +1,18 @@
+module HTTP
+  module MimeType
+    # This allows you to call `response.parse` and get back a Nokogiri object
+    # if the content type is HTML.
+    class HTML < Adapter
+      def encode(obj)
+        obj.to_s if obj
+      end
+      def decode(str)
+        Nokogiri::HTML(str)
+      end
+    end
+    register_adapter "text/html", HTML
+    register_alias   "text/html", :html
+  end
+end

data/lib/spiderman.rb ADDED

@@ -0,0 +1,121 @@
+require "logger"
+require "http"
+require "http/mime_type/html"
+require "http/acts_as_nokogiri_document"
+require "active_support/core_ext/class"
+require "active_support/core_ext/module"
+require "active_support/concern"
+require "spiderman/version"
+require "spiderman/runner"
+require 'spiderman/railtie' if defined?(Rails)
+# Turn any class into a crawler by including this module.
+#
+# Example:
+#
+#   class MySpider < ApplicationJob # Yup, you can define this in a job
+#     queue_as :crawler
+#
+#     include Spiderman
+#
+#     crawl "https://example.com/" do |response|
+#       response.css('.selector a').each do |a|
+#         process! a["href"], :listing
+#       end
+#     end
+#
+#     process :listing do |response|
+#       process! response.css('img'), :image
+#       save_the_thing response.css('.some_selector')
+#     end
+#
+#     process :image do |response|
+#       # Do something with the image file
+#     end
+#
+#     def save_the_thing(thing)
+#       # logic here for saving the thing
+#     end
+#  end
+#
+module Spiderman
+  extend ActiveSupport::Concern
+  included do
+    Spiderman.add(self)
+    class_attribute :crawler, instance_reader: true, default: Runner.new
+    delegate :logger, to: :crawler
+  end
+  class_methods do
+    delegate :crawl!, :process!, to: :new
+    # Use `crawl` to specify URLs to start with. `crawl` accepts one or more
+    # URLs, and will call the block for each URL requested. You can also
+    # define multiple `crawl` blocks with different behavior for each
+    # starting URL. All `crawl` blocks will be called when calling
+    # `SpiderName.crawl!`.
+    #
+    # `response` is an enhanced `HTTP::Response` object that also acts like a
+    # `Nokogiri::HTML` document, e.g. `response.css(…)`
+    def crawl(*urls, &block)
+      urls.each { |url| crawler.register(url, &block) }
+      crawler.start_at(*urls)
+    end
+    # Processors are called from `crawl` and can be used to handle different
+    # types of responsezs.
+    def process(type, &block)
+      crawler.register(type, &block)
+    end
+    def inherited(subclass)
+      subclass.crawler = crawler.dup
+      Spiderman.add(subclass)
+    end
+  end
+  def crawl!
+    crawler.urls.each do |url|
+      process! url
+    end
+  end
+  def process!(url, with = nil)
+    if defined?(ActiveJob) && self.is_a?(ActiveJob::Base)
+      self.class.perform_later(url.to_s, with)
+    else
+      perform(url, with)
+    end
+  end
+  def perform(url, with = nil)
+    handler = crawler.handler_for(with || url)
+    response = crawler.request(url)
+    instance_exec response, &handler
+  end
+  def name
+    self.class.name.demodulize
+  end
+  module_function
+  def list
+    @list ||= []
+  end
+  def run(crawler = nil)
+    crawlers = crawler ? [find(crawler)] : list
+    crawlers.each(&:crawl!)
+  end
+  def find(name)
+    self.list.detect { |crawler| crawler.name.demodulize.underscore == name }
+  end
+  def add(clazz)
+    list.push(clazz)
+  end
+end

data/lib/spiderman/railtie.rb ADDED

@@ -0,0 +1,11 @@
+module Spiderman
+  class Railtie < Rails::Railtie
+    initializer "spiderman" do
+      Spiderman::Runner.logger = Rails.logger
+    end
+    rake_tasks do
+      load "spiderman/tasks.rake"
+    end
+  end
+end

data/lib/spiderman/runner.rb ADDED

@@ -0,0 +1,47 @@
+module Spiderman
+  class Runner
+    class_attribute :logger, instance_accessor: true, default: Logger.new(STDOUT, level: :info)
+    attr_reader :urls, :headers, :handlers
+    def initialize
+      @urls = []
+      @handlers = {}
+      @headers = {}
+    end
+    def start_at(*urls)
+      @urls.append(*urls)
+    end
+    def register(name, &handler)
+      @handlers[name] = handler
+    end
+    def handler_for(name)
+      @handlers[name]
+    end
+    def http
+      HTTP.use(logging: {logger: logger}).headers(headers).follow
+    end
+    def request(url)
+      http.get(url).tap do |response|
+        response.extend HTTP::ActAsNokogiriDocument
+      end
+    end
+    def dup
+      self.class.new.tap do |obj|
+        obj.urls.replace(urls)
+        obj.handlers.update(handlers)
+        obj.headers.update(headers)
+        obj.logger = logger
+      end
+    end
+  protected
+    # Allow access for dup
+    attr_reader :handlers
+  end
+end

data/lib/spiderman/tasks.rake ADDED

@@ -0,0 +1,36 @@
+namespace :spiderman do
+  # Load the environment and eager load all classes
+  task :environment => :environment do
+    if defined?(Rails)
+      ActiveSupport.run_load_hooks(:before_eager_load, Rails.configuration)
+      Rails.configuration.eager_load_namespaces.each(&:eager_load!)
+    end
+    if defined?(Zeitwerk)
+      Zeitwerk::Loader.eager_load_all
+    end
+  end
+  desc "Run crawlers"
+  task :run, [:crawler] => :environment do |task, args|
+    Spiderman.run(args[:crawler])
+  end
+  desc "List available crawlers"
+  task list: :environment do
+    puts Spiderman.list
+  end
+  desc ""
+  task :debug, [:crawler, :url, :type] => :environment do |task, args|
+    unless crawler = Spiderman.find(args[:crawler])
+      raise "Can't find crawler with name `#{args[:crawler]}`. " \
+        "To list all available crawlers, run: `$ rake crawler:list`"
+    end
+    crawler.parse!(args[:url], args[:type])
+  end
+end
+task spiderman: 'spiderman:run'

data/lib/spiderman/version.rb ADDED

@@ -0,0 +1,3 @@
+module Spiderman
+  # Version of the gem; the gemspec reads it as `Spiderman::VERSION`.
+  VERSION = "2.0.0"
+end

data/spiderman.gemspec ADDED

@@ -0,0 +1,33 @@
+require_relative 'lib/spiderman/version'
+# Gem specification for spiderman: identity, metadata, packaged files and
+# runtime dependencies.
+Gem::Specification.new do |spec|
+  spec.name          = "spiderman"
+  spec.version       = Spiderman::VERSION
+  spec.authors       = ["Brandon Keepers"]
+  spec.email         = ["brandon@opensoul.org"]
+  spec.summary       = %q{your friendly neighborhood web crawler}
+  # The description reuses the summary text.
+  spec.description   = spec.summary
+  spec.homepage      = "https://github.com/bkeepers/spiderman"
+  spec.license       = "MIT"
+  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
+  spec.metadata["allowed_push_host"] = "https://rubygems.org"
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = spec.homepage
+  spec.metadata["changelog_uri"] = "#{spec.homepage}/releases"
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  # Paths under test/, spec/ and features/ are left out of the package.
+  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  end
+  # Executables are taken from exe/ only, so the scripts under bin/ are
+  # packaged as plain files rather than installed as commands.
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  # Runtime dependencies.
+  spec.add_runtime_dependency "http", "~> 4.0"
+  spec.add_runtime_dependency "nokogiri", "~> 1.10"
+  spec.add_runtime_dependency "activesupport", ">= 5.0"
+end

metadata ADDED

@@ -0,0 +1,107 @@
+--- !ruby/object:Gem::Specification
+name: spiderman
+version: !ruby/object:Gem::Version
+  version: 2.0.0
+platform: ruby
+authors:
+- Brandon Keepers
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2020-03-22 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: http
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '4.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '4.0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.10'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.10'
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '5.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '5.0'
+description: your friendly neighborhood web crawler
+email:
+- brandon@opensoul.org
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- lib/http/acts_as_nokogiri_document.rb
+- lib/http/mime_type/html.rb
+- lib/spiderman.rb
+- lib/spiderman/railtie.rb
+- lib/spiderman/runner.rb
+- lib/spiderman/tasks.rake
+- lib/spiderman/version.rb
+- spiderman.gemspec
+homepage: https://github.com/bkeepers/spiderman
+licenses:
+- MIT
+metadata:
+  allowed_push_host: https://rubygems.org
+  homepage_uri: https://github.com/bkeepers/spiderman
+  source_code_uri: https://github.com/bkeepers/spiderman
+  changelog_uri: https://github.com/bkeepers/spiderman/releases
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 2.3.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.0.3
+signing_key:
+specification_version: 4
+summary: your friendly neighborhood web crawler
+test_files: []