spiderman 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ba6dbe7099be528ca38656fe766a6a836fb4e15205dc5bda3c7564580ca6bd26
4
+ data.tar.gz: d6b49ceaf38b77d19066f8eb9a17e2c4a081fda6b98a53e10d19af82343ddb98
5
+ SHA512:
6
+ metadata.gz: 9aa9ebbbb420617cef05a71a1824a128e15eaca34aba94aa2e1972d9a9d1f972fbf517e24bec6cc5beea0f9acf859c844771ed55f9130c393fec834e1a13168d
7
+ data.tar.gz: 528743c4fb28e391749b954d5ad8698b7fdac8aaff8438ced572b491e9986126bb15853aa6cb5eeb2af1e77f6b1b19587de58298abc99480aae80f24c9046117
@@ -0,0 +1,12 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+ vendor/gems
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,6 @@
1
+ ---
2
+ language: ruby
3
+ cache: bundler
4
+ rvm:
5
+ - 2.6.5
6
+ before_install: gem install bundler -v 2.1.4
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+
5
+ gem "rake", "~> 12.0"
6
+ gem "rspec", "~> 3.0"
7
+ gem "webmock"
@@ -0,0 +1,85 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ spiderman (2.0.0)
5
+ activesupport (>= 5.0)
6
+ http (~> 4.0)
7
+ nokogiri (~> 1.10)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ activesupport (6.0.2.2)
13
+ concurrent-ruby (~> 1.0, >= 1.0.2)
14
+ i18n (>= 0.7, < 2)
15
+ minitest (~> 5.1)
16
+ tzinfo (~> 1.1)
17
+ zeitwerk (~> 2.2)
18
+ addressable (2.7.0)
19
+ public_suffix (>= 2.0.2, < 5.0)
20
+ concurrent-ruby (1.1.6)
21
+ crack (0.4.3)
22
+ safe_yaml (~> 1.0.0)
23
+ diff-lcs (1.3)
24
+ domain_name (0.5.20190701)
25
+ unf (>= 0.0.5, < 1.0.0)
26
+ ffi (1.12.2)
27
+ ffi-compiler (1.0.1)
28
+ ffi (>= 1.0.0)
29
+ rake
30
+ hashdiff (1.0.1)
31
+ http (4.3.0)
32
+ addressable (~> 2.3)
33
+ http-cookie (~> 1.0)
34
+ http-form_data (~> 2.2)
35
+ http-parser (~> 1.2.0)
36
+ http-cookie (1.0.3)
37
+ domain_name (~> 0.5)
38
+ http-form_data (2.3.0)
39
+ http-parser (1.2.1)
40
+ ffi-compiler (>= 1.0, < 2.0)
41
+ i18n (1.8.2)
42
+ concurrent-ruby (~> 1.0)
43
+ mini_portile2 (2.4.0)
44
+ minitest (5.14.0)
45
+ nokogiri (1.10.9)
46
+ mini_portile2 (~> 2.4.0)
47
+ public_suffix (4.0.3)
48
+ rake (12.3.3)
49
+ rspec (3.9.0)
50
+ rspec-core (~> 3.9.0)
51
+ rspec-expectations (~> 3.9.0)
52
+ rspec-mocks (~> 3.9.0)
53
+ rspec-core (3.9.1)
54
+ rspec-support (~> 3.9.1)
55
+ rspec-expectations (3.9.1)
56
+ diff-lcs (>= 1.2.0, < 2.0)
57
+ rspec-support (~> 3.9.0)
58
+ rspec-mocks (3.9.1)
59
+ diff-lcs (>= 1.2.0, < 2.0)
60
+ rspec-support (~> 3.9.0)
61
+ rspec-support (3.9.2)
62
+ safe_yaml (1.0.5)
63
+ thread_safe (0.3.6)
64
+ tzinfo (1.2.6)
65
+ thread_safe (~> 0.1)
66
+ unf (0.1.4)
67
+ unf_ext
68
+ unf_ext (0.0.7.6)
69
+ webmock (3.8.3)
70
+ addressable (>= 2.3.6)
71
+ crack (>= 0.3.2)
72
+ hashdiff (>= 0.4.0, < 2.0.0)
73
+ zeitwerk (2.3.0)
74
+
75
+ PLATFORMS
76
+ ruby
77
+
78
+ DEPENDENCIES
79
+ rake (~> 12.0)
80
+ rspec (~> 3.0)
81
+ spiderman!
82
+ webmock
83
+
84
+ BUNDLED WITH
85
+ 2.1.4
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Brandon Keepers
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,86 @@
1
+
2
+ <div align="center">
3
+ <h1><img width="300" height="300" src="https://user-images.githubusercontent.com/173/77249168-99488080-6c15-11ea-98de-3d14a412265d.png" alt="Spiderman"></h1>
4
+
5
+ <h2>your friendly neighborhood web crawler</h2>
6
+ </div>
7
+
8
+ Spiderman is a Ruby gem for crawling and processing web pages.
9
+
10
+ ## Installation
11
+
12
+ Add this line to your application's Gemfile:
13
+
14
+ ```ruby
15
+ gem 'spiderman'
16
+ ```
17
+
18
+ And then execute:
19
+
20
+ $ bundle install
21
+
22
+ Or install it yourself as:
23
+
24
+ $ gem install spiderman
25
+
26
+ ## Usage
27
+
28
+ ```ruby
29
+ class HackerNewsCrawler
30
+ include Spiderman
31
+
32
+ crawl "https://news.ycombinator.com/" do |response|
33
+ response.css('a.storylink').each do |a|
34
+ process! a["href"], :story
35
+ end
36
+ end
37
+
38
+ process :story do |response|
39
+ logger.info "#{response.uri} #{response.css('title').text}"
40
+ save_page(response)
41
+ end
42
+
43
+ def save_page(page)
44
+ # logic here for saving the page
45
+ end
46
+ end
47
+ ```
48
+
49
+ Run the crawler:
50
+
51
+ ```ruby
52
+ HackerNewsCrawler.crawl!
53
+ ```
54
+
55
+ ### ActiveJob
56
+
57
+ Spiderman works with [ActiveJob](https://edgeguides.rubyonrails.org/active_job_basics.html) out of the box. If your crawler class inherits from `ActiveJob::Base`, then requests will be made in your background worker. Each request will run as a separate job.
58
+
59
+ ```ruby
60
+ class MyCrawler < ActiveJob::Base
61
+ queue_as :crawler
62
+
63
+ crawl "https://example.com" do |response|
64
+ response.css('a').each {|a| process! a["href"], :link }
65
+ end
66
+
67
+ process :link do |response|
68
+ logger.info "Processing #{response.uri}"
69
+ end
70
+ end
71
+ ```
72
+
73
+ ## Development
74
+
75
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
76
+
77
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
78
+
79
+ ## Contributing
80
+
81
+ Bug reports and pull requests are welcome on GitHub at https://github.com/bkeepers/spiderman.
82
+
83
+
84
+ ## License
85
+
86
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "spiderman"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,13 @@
module HTTP
  # Module to mix into `HTTP::Response` to make it act like a Nokogiri doc.
  #
  # Any message the host object does not understand is forwarded to the
  # parsed document, so callers can write `response.css(...)` directly.
  module ActAsNokogiriDocument
    # Parses the response body once and caches the result.
    #
    # Relies on the host object providing `parse` and `content_type`.
    # `defined?` is used instead of `||=` so that a nil/false parse result is
    # cached too and the body is not parsed again.
    #
    # @return [Object] whatever `parse` returns for the response's MIME type
    def document
      return @document if defined?(@document)

      @document = parse(content_type.mime_type)
    end

    # Forwards unknown messages (with arguments and block) to the document.
    def method_missing(method, *args, &block)
      document.send(method, *args, &block)
    end

    # Keeps `respond_to?` consistent with `method_missing`: the response
    # answers to whatever the parsed document answers to.
    def respond_to_missing?(method, include_private = false)
      document.respond_to?(method, include_private) || super
    rescue StandardError
      # `respond_to?` must never raise. If the body cannot be parsed there is
      # no document to delegate to, so fall back to the default answer.
      super
    end
  end
end
@@ -0,0 +1,18 @@
# Nokogiri is a declared runtime dependency of the gem, but nothing else loads
# it, so require it where it is used.
require "nokogiri"

module HTTP
  module MimeType
    # This allows you to call `response.parse` and get back a Nokogiri object
    # if the content type is HTML.
    #
    # NOTE(review): `Adapter`, `register_adapter` and `register_alias` are
    # expected to come from the http gem's `HTTP::MimeType` — confirm this file
    # is only loaded after `require "http"`.
    class HTML < Adapter
      # Serializes an object for use as an HTML body.
      #
      # @param obj [Object, nil]
      # @return [String, nil] `obj.to_s`, or nil when `obj` is nil/false
      def encode(obj)
        obj.to_s if obj
      end

      # Parses an HTML string.
      #
      # @param str [String] raw HTML
      # @return [Object] the result of `Nokogiri::HTML(str)`
      def decode(str)
        Nokogiri::HTML(str)
      end
    end

    register_adapter "text/html", HTML
    register_alias "text/html", :html
  end
end
@@ -0,0 +1,121 @@
1
+ require "logger"
2
+ require "http"
3
+ require "http/mime_type/html"
4
+ require "http/acts_as_nokogiri_document"
5
+ require "active_support/core_ext/class"
6
+ require "active_support/core_ext/module"
7
+ require "active_support/concern"
8
+ require "spiderman/version"
9
+ require "spiderman/runner"
10
+ require 'spiderman/railtie' if defined?(Rails)
11
+
# Turn any class into a crawler by including this module.
#
# Example:
#
#   class MySpider < ApplicationJob # Yup, you can define this in a job
#     queue_as :crawler
#
#     include Spiderman
#
#     crawl "https://example.com/" do |response|
#       response.css('.selector a').each do |a|
#         process! a["href"], :listing
#       end
#     end
#
#     process :listing do |response|
#       response.css('img').each { |img| process! img["src"], :image }
#       save_the_thing response.css('.some_selector')
#     end
#
#     process :image do |response|
#       # Do something with the image file
#     end
#
#     def save_the_thing(thing)
#       # logic here for saving the thing
#     end
#   end
#
module Spiderman
  extend ActiveSupport::Concern

  included do
    Spiderman.add(self)
    # The block runs once per including class, so each one gets its own Runner.
    class_attribute :crawler, instance_reader: true, default: Runner.new

    delegate :logger, to: :crawler
  end

  class_methods do
    # `MySpider.crawl!` / `MySpider.process!` run on a fresh instance.
    delegate :crawl!, :process!, to: :new

    # Use `crawl` to specify URLs to start with. `crawl` accepts one or more
    # URLs, and will call the block for each URL requested. You can also
    # define multiple `crawl` blocks with different behavior for each
    # starting URL. All `crawl` blocks will be called when calling
    # `SpiderName.crawl!`.
    #
    # `response` is an enhanced `HTTP::Response` object that also acts like a
    # `Nokogiri::HTML` document, e.g. `response.css(…)`
    def crawl(*urls, &block)
      urls.each { |url| crawler.register(url, &block) }
      crawler.start_at(*urls)
    end

    # Processors are called from `crawl` and can be used to handle different
    # types of responses.
    def process(type, &block)
      crawler.register(type, &block)
    end

    # Gives every subclass its own copy of the parent's runner and registers
    # the subclass as a crawler.
    def inherited(subclass)
      # Keep the rest of the `inherited` hook chain (e.g. the parent class's
      # own hooks) working.
      super
      subclass.crawler = crawler.dup
      Spiderman.add(subclass)
    end
  end

  # Processes every start URL registered with `crawl`.
  def crawl!
    crawler.urls.each do |url|
      process! url
    end
  end

  # Processes `url` with the handler registered under `with`, or under the URL
  # itself when `with` is nil. On an ActiveJob instance the work is enqueued
  # with `perform_later`; otherwise it runs inline.
  def process!(url, with = nil)
    if defined?(ActiveJob) && self.is_a?(ActiveJob::Base)
      self.class.perform_later(url.to_s, with)
    else
      perform(url, with)
    end
  end

  # Requests `url` and runs the matching handler in the context of this
  # instance, passing it the response.
  #
  # @raise [ArgumentError] when no handler is registered for `with || url`
  def perform(url, with = nil)
    key = with || url
    handler = crawler.handler_for(key)
    # Fail before hitting the network instead of after a wasted request.
    raise ArgumentError, "No handler registered for #{key.inspect}" unless handler

    response = crawler.request(url)
    instance_exec response, &handler
  end

  # Class name without its namespace.
  def name
    self.class.name.demodulize
  end

  # Everything below is callable as `Spiderman.list`, `Spiderman.run`, etc.
  module_function

  # All crawler classes registered so far.
  def list
    @list ||= []
  end

  # Runs the crawler registered under `crawler` (see `find`), or every
  # registered crawler when no name is given.
  #
  # @raise [ArgumentError] when a name is given but no crawler matches it
  def run(crawler = nil)
    crawlers = crawler ? [find(crawler)] : list
    if crawlers.any?(&:nil?)
      raise ArgumentError, "Can't find crawler with name `#{crawler}`"
    end

    crawlers.each(&:crawl!)
  end

  # Looks up a crawler class by its underscored, demodulized class name,
  # e.g. "hacker_news_crawler" for `Crawlers::HackerNewsCrawler`.
  #
  # @param name [String, Symbol]
  # @return [Class, nil]
  def find(name)
    wanted = name.to_s
    self.list.detect do |crawler|
      # Anonymous classes have no name and can never match.
      crawler.name && crawler.name.demodulize.underscore == wanted
    end
  end

  # Registers a crawler class.
  def add(clazz)
    list.push(clazz)
  end
end
@@ -0,0 +1,11 @@
module Spiderman
  # Hooks Spiderman into a Rails application.
  class Railtie < Rails::Railtie
    # Send crawler log output to the application's logger.
    initializer("spiderman") { Spiderman::Runner.logger = Rails.logger }

    # Make the gem's rake tasks available to the host application.
    rake_tasks { load "spiderman/tasks.rake" }
  end
end
@@ -0,0 +1,47 @@
module Spiderman
  # Holds a crawler's configuration (start URLs, request headers, handlers)
  # and performs its HTTP requests.
  class Runner
    class_attribute :logger, instance_accessor: true, default: Logger.new(STDOUT, level: :info)
    # `handlers` is deliberately not listed here; it is a protected reader
    # declared at the bottom of the class.
    attr_reader :urls, :headers

    def initialize
      @urls = []
      @handlers = {}
      @headers = {}
    end

    # Adds URLs to crawl from.
    #
    # @return [Array] the full list of start URLs
    def start_at(*urls)
      # `push` rather than `append`, which plain Ruby only has from 2.5 on.
      @urls.push(*urls)
    end

    # Stores `handler` under `name` (a URL or a processor type), replacing any
    # handler previously registered under the same name.
    def register(name, &handler)
      @handlers[name] = handler
    end

    # @return [Proc, nil] the handler registered under `name`, if any
    def handler_for(name)
      @handlers[name]
    end

    # HTTP client configured with this runner's logger and headers, following
    # redirects.
    def http
      HTTP.use(logging: {logger: logger}).headers(headers).follow
    end

    # Performs a GET request and extends the response so it also behaves like
    # its parsed document.
    def request(url)
      http.get(url).tap do |response|
        response.extend HTTP::ActAsNokogiriDocument
      end
    end

    # Returns an independent copy: the collections are copied so that changes
    # to the copy do not leak back into this runner.
    def dup
      self.class.new.tap do |obj|
        obj.urls.replace(urls)
        obj.handlers.update(handlers)
        obj.headers.update(headers)
        obj.logger = logger
      end
    end

    protected

    # Allow access for dup
    attr_reader :handlers
  end
end
@@ -0,0 +1,36 @@
namespace :spiderman do
  # Load the environment and eager load all classes, so that every crawler
  # class has been defined (and registered) before a task looks it up.
  #
  # The prerequisite is written as "rake:environment" to force a top-level
  # lookup. A bare `:environment` inside this namespace resolves to this very
  # task, which Rake reports as a circular dependency.
  task environment: "rake:environment" do
    if defined?(Rails)
      ActiveSupport.run_load_hooks(:before_eager_load, Rails.configuration)
      Rails.configuration.eager_load_namespaces.each(&:eager_load!)
    end

    Zeitwerk::Loader.eager_load_all if defined?(Zeitwerk)
  end

  desc "Run crawlers"
  task :run, [:crawler] => :environment do |_task, args|
    Spiderman.run(args[:crawler])
  end

  desc "List available crawlers"
  task list: :environment do
    puts Spiderman.list
  end

  desc "Process a single URL with one crawler (for debugging)"
  task :debug, [:crawler, :url, :type] => :environment do |_task, args|
    crawler = Spiderman.find(args[:crawler])
    unless crawler
      raise "Can't find crawler with name `#{args[:crawler]}`. " \
        "To list all available crawlers, run: `$ rake spiderman:list`"
    end

    # Rake passes arguments as strings; processor types are assumed to be
    # registered as symbols (`process :story`), so convert before the lookup.
    type = args[:type] && args[:type].to_sym
    crawler.process!(args[:url], type)
  end
end

task spiderman: 'spiderman:run'
@@ -0,0 +1,3 @@
1
+ module Spiderman
2
+ VERSION = "2.0.0"
3
+ end
@@ -0,0 +1,33 @@
1
+ require_relative 'lib/spiderman/version'
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = "spiderman"
5
+ spec.version = Spiderman::VERSION
6
+ spec.authors = ["Brandon Keepers"]
7
+ spec.email = ["brandon@opensoul.org"]
8
+
9
+ spec.summary = %q{your friendly neighborhood web crawler}
10
+ spec.description = spec.summary
11
+ spec.homepage = "https://github.com/bkeepers/spiderman"
12
+ spec.license = "MIT"
13
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
+
15
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
16
+
17
+ spec.metadata["homepage_uri"] = spec.homepage
18
+ spec.metadata["source_code_uri"] = spec.homepage
19
+ spec.metadata["changelog_uri"] = "#{spec.homepage}/releases"
20
+
21
+ # Specify which files should be added to the gem when it is released.
22
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
23
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
24
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
25
+ end
26
+ spec.bindir = "exe"
27
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
+ spec.require_paths = ["lib"]
29
+
30
+ spec.add_runtime_dependency "http", "~> 4.0"
31
+ spec.add_runtime_dependency "nokogiri", "~> 1.10"
32
+ spec.add_runtime_dependency "activesupport", ">= 5.0"
33
+ end
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spiderman
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Brandon Keepers
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-03-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: http
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '4.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '4.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.10'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.10'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '5.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '5.0'
55
+ description: your friendly neighborhood web crawler
56
+ email:
57
+ - brandon@opensoul.org
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".rspec"
64
+ - ".travis.yml"
65
+ - Gemfile
66
+ - Gemfile.lock
67
+ - LICENSE.txt
68
+ - README.md
69
+ - Rakefile
70
+ - bin/console
71
+ - bin/setup
72
+ - lib/http/acts_as_nokogiri_document.rb
73
+ - lib/http/mime_type/html.rb
74
+ - lib/spiderman.rb
75
+ - lib/spiderman/railtie.rb
76
+ - lib/spiderman/runner.rb
77
+ - lib/spiderman/tasks.rake
78
+ - lib/spiderman/version.rb
79
+ - spiderman.gemspec
80
+ homepage: https://github.com/bkeepers/spiderman
81
+ licenses:
82
+ - MIT
83
+ metadata:
84
+ allowed_push_host: https://rubygems.org
85
+ homepage_uri: https://github.com/bkeepers/spiderman
86
+ source_code_uri: https://github.com/bkeepers/spiderman
87
+ changelog_uri: https://github.com/bkeepers/spiderman/releases
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: 2.3.0
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ requirements: []
103
+ rubygems_version: 3.0.3
104
+ signing_key:
105
+ specification_version: 4
106
+ summary: your friendly neighborhood web crawler
107
+ test_files: []