spiderman 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ba6dbe7099be528ca38656fe766a6a836fb4e15205dc5bda3c7564580ca6bd26
4
+ data.tar.gz: d6b49ceaf38b77d19066f8eb9a17e2c4a081fda6b98a53e10d19af82343ddb98
5
+ SHA512:
6
+ metadata.gz: 9aa9ebbbb420617cef05a71a1824a128e15eaca34aba94aa2e1972d9a9d1f972fbf517e24bec6cc5beea0f9acf859c844771ed55f9130c393fec834e1a13168d
7
+ data.tar.gz: 528743c4fb28e391749b954d5ad8698b7fdac8aaff8438ced572b491e9986126bb15853aa6cb5eeb2af1e77f6b1b19587de58298abc99480aae80f24c9046117
@@ -0,0 +1,12 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+ vendor/gems
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,6 @@
1
+ ---
2
+ language: ruby
3
+ cache: bundler
4
+ rvm:
5
+ - 2.6.5
6
+ before_install: gem install bundler -v 2.1.4
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+
5
+ gem "rake", "~> 12.0"
6
+ gem "rspec", "~> 3.0"
7
+ gem "webmock"
@@ -0,0 +1,85 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ spiderman (2.0.0)
5
+ activesupport (>= 5.0)
6
+ http (~> 4.0)
7
+ nokogiri (~> 1.10)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ activesupport (6.0.2.2)
13
+ concurrent-ruby (~> 1.0, >= 1.0.2)
14
+ i18n (>= 0.7, < 2)
15
+ minitest (~> 5.1)
16
+ tzinfo (~> 1.1)
17
+ zeitwerk (~> 2.2)
18
+ addressable (2.7.0)
19
+ public_suffix (>= 2.0.2, < 5.0)
20
+ concurrent-ruby (1.1.6)
21
+ crack (0.4.3)
22
+ safe_yaml (~> 1.0.0)
23
+ diff-lcs (1.3)
24
+ domain_name (0.5.20190701)
25
+ unf (>= 0.0.5, < 1.0.0)
26
+ ffi (1.12.2)
27
+ ffi-compiler (1.0.1)
28
+ ffi (>= 1.0.0)
29
+ rake
30
+ hashdiff (1.0.1)
31
+ http (4.3.0)
32
+ addressable (~> 2.3)
33
+ http-cookie (~> 1.0)
34
+ http-form_data (~> 2.2)
35
+ http-parser (~> 1.2.0)
36
+ http-cookie (1.0.3)
37
+ domain_name (~> 0.5)
38
+ http-form_data (2.3.0)
39
+ http-parser (1.2.1)
40
+ ffi-compiler (>= 1.0, < 2.0)
41
+ i18n (1.8.2)
42
+ concurrent-ruby (~> 1.0)
43
+ mini_portile2 (2.4.0)
44
+ minitest (5.14.0)
45
+ nokogiri (1.10.9)
46
+ mini_portile2 (~> 2.4.0)
47
+ public_suffix (4.0.3)
48
+ rake (12.3.3)
49
+ rspec (3.9.0)
50
+ rspec-core (~> 3.9.0)
51
+ rspec-expectations (~> 3.9.0)
52
+ rspec-mocks (~> 3.9.0)
53
+ rspec-core (3.9.1)
54
+ rspec-support (~> 3.9.1)
55
+ rspec-expectations (3.9.1)
56
+ diff-lcs (>= 1.2.0, < 2.0)
57
+ rspec-support (~> 3.9.0)
58
+ rspec-mocks (3.9.1)
59
+ diff-lcs (>= 1.2.0, < 2.0)
60
+ rspec-support (~> 3.9.0)
61
+ rspec-support (3.9.2)
62
+ safe_yaml (1.0.5)
63
+ thread_safe (0.3.6)
64
+ tzinfo (1.2.6)
65
+ thread_safe (~> 0.1)
66
+ unf (0.1.4)
67
+ unf_ext
68
+ unf_ext (0.0.7.6)
69
+ webmock (3.8.3)
70
+ addressable (>= 2.3.6)
71
+ crack (>= 0.3.2)
72
+ hashdiff (>= 0.4.0, < 2.0.0)
73
+ zeitwerk (2.3.0)
74
+
75
+ PLATFORMS
76
+ ruby
77
+
78
+ DEPENDENCIES
79
+ rake (~> 12.0)
80
+ rspec (~> 3.0)
81
+ spiderman!
82
+ webmock
83
+
84
+ BUNDLED WITH
85
+ 2.1.4
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Brandon Keepers
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,86 @@
1
+
2
+ <div align="center">
3
+ <h1><img width="300" height="300" src="https://user-images.githubusercontent.com/173/77249168-99488080-6c15-11ea-98de-3d14a412265d.png" alt="Spiderman"></h1>
4
+
5
+ <h2>your friendly neighborhood web crawler</h2>
6
+ </div>
7
+
8
+ Spiderman is a Ruby gem for crawling and processing web pages.
9
+
10
+ ## Installation
11
+
12
+ Add this line to your application's Gemfile:
13
+
14
+ ```ruby
15
+ gem 'spiderman'
16
+ ```
17
+
18
+ And then execute:
19
+
20
+ $ bundle install
21
+
22
+ Or install it yourself as:
23
+
24
+ $ gem install spiderman
25
+
26
+ ## Usage
27
+
28
+ ```ruby
29
+ class HackerNewsCrawler
30
+ include Spiderman
31
+
32
+ crawl "https://news.ycombinator.com/" do |response|
33
+ response.css('a.storylink').each do |a|
34
+ process! a["href"], :story
35
+ end
36
+ end
37
+
38
+ process :story do |response|
39
+ logger.info "#{response.uri} #{response.css('title').text}"
40
+ save_page(response)
41
+ end
42
+
43
+ def save_page(page)
44
+ # logic here for saving the page
45
+ end
46
+ end
47
+ ```
48
+
49
+ Run the crawler:
50
+
51
+ ```ruby
52
+ HackerNewsCrawler.crawl!
53
+ ```
54
+
55
+ ### ActiveJob
56
+
57
+ Spiderman works with [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html) out of the box. If your crawler class inherits from `ActiveJob::Base` (and includes `Spiderman`), then requests will be made in your background worker. Each request will run as a separate job.
58
+
59
+ ```ruby
60
+ class MyCrawler < ActiveJob::Base
61
+ queue_as :crawler
62
+
63
+ crawl "https://example.com" do |response|
64
+ response.css('a').each {|a| process! a["href"], :link }
65
+ end
66
+
67
+ process :link do |response|
68
+ logger.info "Processing #{response.uri}"
69
+ end
70
+ end
71
+ ```
72
+
73
+ ## Development
74
+
75
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
76
+
77
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
78
+
79
+ ## Contributing
80
+
81
+ Bug reports and pull requests are welcome on GitHub at https://github.com/bkeepers/spiderman.
82
+
83
+
84
+ ## License
85
+
86
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "spiderman"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,13 @@
1
module HTTP
  # Module to mix into `HTTP::Response` so that messages the response does not
  # understand are forwarded to its parsed body, e.g. `response.css(...)`.
  module ActAsNokogiriDocument
    # Parsed form of the response body.
    #
    # Calls `parse` with the response's MIME type on first use and caches the
    # result (including a nil result) for subsequent calls.
    #
    # @return [Object] whatever `parse` returns for this content type
    def document
      return @document if defined?(@document)

      @document = parse(content_type.mime_type)
    end

    # Forwards every message the receiver does not define itself to
    # `document`, passing arguments and block through unchanged.
    def method_missing(method, *args, &block)
      document.send(method, *args, &block)
    end

    # Keeps `respond_to?` and `method` in agreement with `method_missing`.
    #
    # @param method [Symbol] name being queried
    # @param include_private [Boolean] whether private methods count
    # @return [Boolean]
    def respond_to_missing?(method, include_private = false)
      delegable =
        begin
          document.respond_to?(method, include_private)
        rescue StandardError
          # If the body cannot be parsed there is nothing to delegate to; a
          # `respond_to?` query must not blow up because of that.
          false
        end

      delegable || super
    end
  end
end
@@ -0,0 +1,18 @@
1
# `decode` below uses Nokogiri; load it here so this file does not depend on
# the host application having required it already.
require "nokogiri"

module HTTP
  module MimeType
    # This allows you to call `response.parse` and get back a Nokogiri object
    # if the content type is HTML.
    class HTML < Adapter
      # @param obj [#to_s, nil] value to serialize
      # @return [String, nil] `obj.to_s`, or nil when `obj` is nil or false
      def encode(obj)
        obj.to_s if obj
      end

      # @param str [String] raw HTML
      # @return [Object] the result of `Nokogiri::HTML(str)`
      def decode(str)
        Nokogiri::HTML(str)
      end
    end

    register_adapter "text/html", HTML
    register_alias "text/html", :html
  end
end
@@ -0,0 +1,121 @@
1
+ require "logger"
2
+ require "http"
3
+ require "http/mime_type/html"
4
+ require "http/acts_as_nokogiri_document"
5
+ require "active_support/core_ext/class"
6
+ require "active_support/core_ext/module"
7
+ require "active_support/concern"
8
+ require "spiderman/version"
9
+ require "spiderman/runner"
10
+ require 'spiderman/railtie' if defined?(Rails)
11
+
12
+ # Turn any class into a crawler by including this module.
13
+ #
14
+ # Example:
15
+ #
16
+ # class MySpider < ApplicationJob # Yup, you can define this in a job
17
+ # queue_as :crawler
18
+ #
19
+ # include Spiderman
20
+ #
21
+ # crawl "https://example.com/" do |response|
22
+ # response.css('.selector a').each do |a|
23
+ # process! a["href"], :listing
24
+ # end
25
+ # end
26
+ #
27
+ # process :listing do |response|
28
+ # process! response.css('img'), :image
29
+ # save_the_thing response.css('.some_selector')
30
+ # end
31
+ #
32
+ # process :image do |response|
33
+ # # Do something with the image file
34
+ # end
35
+ #
36
+ # def save_the_thing(thing)
37
+ # # logic here for saving the thing
38
+ # end
39
+ # end
40
+ #
41
module Spiderman
  extend ActiveSupport::Concern

  included do
    # Track the including class so `Spiderman.list` / `Spiderman.run` see it.
    Spiderman.add(self)
    class_attribute :crawler, instance_reader: true, default: Runner.new

    delegate :logger, to: :crawler
  end

  class_methods do
    # `SpiderName.crawl!` / `SpiderName.process!` run on a fresh instance.
    delegate :crawl!, :process!, to: :new

    # Use `crawl` to specify URLs to start with. `crawl` accepts one or more
    # URLs, and will call the block for each URL requested. You can also
    # define multiple `crawl` blocks with different behavior for each
    # starting URL. All `crawl` blocks will be called when calling
    # `SpiderName.crawl!`.
    #
    # `response` is an enhanced `HTTP::Response` object that also acts like a
    # `Nokogiri::HTML` document, e.g. `response.css(…)`
    def crawl(*urls, &block)
      urls.each { |url| crawler.register(url, &block) }
      crawler.start_at(*urls)
    end

    # Processors are called from `crawl` and can be used to handle different
    # types of responses.
    def process(type, &block)
      crawler.register(type, &block)
    end

    # Gives each subclass its own copy of the parent's runner and adds the
    # subclass to the crawler list.
    def inherited(subclass)
      # Let other `inherited` hooks in the ancestor chain run as well.
      super
      subclass.crawler = crawler.dup
      Spiderman.add(subclass)
    end
  end

  # Processes every start URL registered through `crawl`.
  def crawl!
    crawler.urls.each { |url| process!(url) }
  end

  # Processes `url` with the handler registered under `with` (or under the
  # URL itself when `with` is nil). Instances of `ActiveJob::Base` enqueue a
  # job via `perform_later`; everything else runs `perform` inline.
  def process!(url, with = nil)
    if defined?(ActiveJob) && is_a?(ActiveJob::Base)
      self.class.perform_later(url.to_s, with)
    else
      perform(url, with)
    end
  end

  # Requests `url` and evaluates the matching handler in the context of this
  # instance, passing it the response.
  #
  # @raise [ArgumentError] when no handler is registered for `with || url`
  def perform(url, with = nil)
    key = with || url
    handler = crawler.handler_for(key)
    # Fail before making the HTTP request, and with a message that names the
    # missing handler.
    raise ArgumentError, "No handler registered for #{key.inspect}" unless handler

    response = crawler.request(url)
    instance_exec response, &handler
  end

  # Class name of this crawler without its enclosing namespaces.
  def name
    self.class.name.demodulize
  end

  module_function

  # Every class registered through `add`.
  def list
    @list ||= []
  end

  # Runs all registered crawlers, or only the one matching `crawler`.
  #
  # @param crawler [String, nil] name as matched by `find`
  # @raise [ArgumentError] when a name is given but no crawler matches it
  def run(crawler = nil)
    return list.each(&:crawl!) unless crawler

    target = find(crawler)
    raise ArgumentError, "Can't find crawler with name `#{crawler}`" unless target

    [target].each(&:crawl!)
  end

  # Looks up a registered crawler whose demodulized, underscored class name
  # equals `name`; returns nil when none matches.
  def find(name)
    list.detect { |crawler| crawler.name.demodulize.underscore == name }
  end

  # Appends `clazz` to the list of crawlers.
  def add(clazz)
    list.push(clazz)
  end
end
@@ -0,0 +1,11 @@
1
+ module Spiderman
2
+ class Railtie < Rails::Railtie
3
+ initializer "spiderman" do
4
+ Spiderman::Runner.logger = Rails.logger
5
+ end
6
+
7
+ rake_tasks do
8
+ load "spiderman/tasks.rake"
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,47 @@
1
module Spiderman
  # Holds a crawler's start URLs, request headers and handlers, and performs
  # the HTTP requests.
  class Runner
    class_attribute :logger, instance_accessor: true, default: Logger.new(STDOUT, level: :info)
    attr_reader :urls, :headers

    def initialize
      @urls = []
      @handlers = {}
      @headers = {}
    end

    # Adds start URLs.
    #
    # @return [Array] the full list of start URLs
    def start_at(*urls)
      # `push` rather than `append`: Array#append only exists on Ruby >= 2.5.
      @urls.push(*urls)
    end

    # Stores `handler` under `name`, replacing any previous handler with that
    # name.
    def register(name, &handler)
      @handlers[name] = handler
    end

    # @return [Proc, nil] handler registered under `name`, if any
    def handler_for(name)
      @handlers[name]
    end

    # HTTP client with request logging, the configured headers and redirect
    # following enabled.
    def http
      HTTP.use(logging: {logger: logger}).headers(headers).follow
    end

    # Performs a GET request and extends the response with
    # `HTTP::ActAsNokogiriDocument`.
    def request(url)
      http.get(url).tap do |response|
        response.extend HTTP::ActAsNokogiriDocument
      end
    end

    # Returns a new runner with its own copies of the URL list, handler table
    # and header hash, so later changes to the copy do not affect the
    # original.
    def dup
      self.class.new.tap do |copy|
        copy.urls.replace(urls)
        copy.handlers.update(handlers)
        copy.headers.update(headers)
        copy.logger = logger
      end
    end

    protected

    # Allow access for dup
    attr_reader :handlers
  end
end
@@ -0,0 +1,36 @@
1
namespace :spiderman do
  # Load the environment and eager load all classes.
  #
  # The prerequisite is written as "rake:environment" so it resolves to the
  # top-level task; a bare `:environment` inside this namespace would resolve
  # to this very task and trip Rake's circular dependency check.
  task :environment => "rake:environment" do
    if defined?(Rails)
      ActiveSupport.run_load_hooks(:before_eager_load, Rails.configuration)
      Rails.configuration.eager_load_namespaces.each(&:eager_load!)
    end

    Zeitwerk::Loader.eager_load_all if defined?(Zeitwerk)
  end

  desc "Run crawlers"
  task :run, [:crawler] => :environment do |_task, args|
    Spiderman.run(args[:crawler])
  end

  desc "List available crawlers"
  task list: :environment do
    puts Spiderman.list
  end

  desc "Process a single URL with the given crawler"
  task :debug, [:crawler, :url, :type] => :environment do |_task, args|
    crawler = Spiderman.find(args[:crawler])
    unless crawler
      raise "Can't find crawler with name `#{args[:crawler]}`. " \
        "To list all available crawlers, run: `$ rake spiderman:list`"
    end

    # Rake passes arguments as Strings, while `process` handlers are
    # registered under Symbols (e.g. `process :story`).
    type = args[:type] && args[:type].to_sym
    crawler.process!(args[:url], type)
  end
end

task spiderman: 'spiderman:run'
@@ -0,0 +1,3 @@
1
+ module Spiderman
2
+ VERSION = "2.0.0"
3
+ end
@@ -0,0 +1,33 @@
1
+ require_relative 'lib/spiderman/version'
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = "spiderman"
5
+ spec.version = Spiderman::VERSION
6
+ spec.authors = ["Brandon Keepers"]
7
+ spec.email = ["brandon@opensoul.org"]
8
+
9
+ spec.summary = %q{your friendly neighborhood web crawler}
10
+ spec.description = spec.summary
11
+ spec.homepage = "https://github.com/bkeepers/spiderman"
12
+ spec.license = "MIT"
13
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
+
15
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
16
+
17
+ spec.metadata["homepage_uri"] = spec.homepage
18
+ spec.metadata["source_code_uri"] = spec.homepage
19
+ spec.metadata["changelog_uri"] = "#{spec.homepage}/releases"
20
+
21
+ # Specify which files should be added to the gem when it is released.
22
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
23
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
24
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
25
+ end
26
+ spec.bindir = "exe"
27
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
+ spec.require_paths = ["lib"]
29
+
30
+ spec.add_runtime_dependency "http", "~> 4.0"
31
+ spec.add_runtime_dependency "nokogiri", "~> 1.10"
32
+ spec.add_runtime_dependency "activesupport", ">= 5.0"
33
+ end
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spiderman
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Brandon Keepers
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-03-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: http
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '4.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '4.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.10'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.10'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '5.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '5.0'
55
+ description: your friendly neighborhood web crawler
56
+ email:
57
+ - brandon@opensoul.org
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".rspec"
64
+ - ".travis.yml"
65
+ - Gemfile
66
+ - Gemfile.lock
67
+ - LICENSE.txt
68
+ - README.md
69
+ - Rakefile
70
+ - bin/console
71
+ - bin/setup
72
+ - lib/http/acts_as_nokogiri_document.rb
73
+ - lib/http/mime_type/html.rb
74
+ - lib/spiderman.rb
75
+ - lib/spiderman/railtie.rb
76
+ - lib/spiderman/runner.rb
77
+ - lib/spiderman/tasks.rake
78
+ - lib/spiderman/version.rb
79
+ - spiderman.gemspec
80
+ homepage: https://github.com/bkeepers/spiderman
81
+ licenses:
82
+ - MIT
83
+ metadata:
84
+ allowed_push_host: https://rubygems.org
85
+ homepage_uri: https://github.com/bkeepers/spiderman
86
+ source_code_uri: https://github.com/bkeepers/spiderman
87
+ changelog_uri: https://github.com/bkeepers/spiderman/releases
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: 2.3.0
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ requirements: []
103
+ rubygems_version: 3.0.3
104
+ signing_key:
105
+ specification_version: 4
106
+ summary: your friendly neighborhood web crawler
107
+ test_files: []