scruber 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +39 -0
  8. data/Rakefile +6 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/scruber +4 -0
  12. data/lib/scruber/app_searcher.rb +31 -0
  13. data/lib/scruber/cli/project_generator.rb +47 -0
  14. data/lib/scruber/cli/templates/Gemfile.tt +6 -0
  15. data/lib/scruber/cli/templates/application.tt +18 -0
  16. data/lib/scruber/cli/templates/bin/scruber.tt +6 -0
  17. data/lib/scruber/cli/templates/boot.tt +3 -0
  18. data/lib/scruber/cli/templates/gitignore.tt +12 -0
  19. data/lib/scruber/cli/templates/initializers/proxies.tt +10 -0
  20. data/lib/scruber/cli/templates/initializers/user_agents.tt +14 -0
  21. data/lib/scruber/cli/templates/scrapers/sample.tt +7 -0
  22. data/lib/scruber/cli.rb +40 -0
  23. data/lib/scruber/core/configuration.rb +30 -0
  24. data/lib/scruber/core/crawler.rb +92 -0
  25. data/lib/scruber/core/extensions/base.rb +26 -0
  26. data/lib/scruber/core/extensions/csv_output.rb +62 -0
  27. data/lib/scruber/core/extensions/loop.rb +39 -0
  28. data/lib/scruber/core/page_format/base.rb +11 -0
  29. data/lib/scruber/core/page_format/html.rb +13 -0
  30. data/lib/scruber/core/page_format/xml.rb +13 -0
  31. data/lib/scruber/core/page_format.rb +33 -0
  32. data/lib/scruber/fetcher.rb +34 -0
  33. data/lib/scruber/fetcher_adapters/abstract_adapter.rb +119 -0
  34. data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb +78 -0
  35. data/lib/scruber/helpers/dictionary_reader/csv.rb +27 -0
  36. data/lib/scruber/helpers/dictionary_reader/xml.rb +23 -0
  37. data/lib/scruber/helpers/dictionary_reader.rb +33 -0
  38. data/lib/scruber/helpers/fetcher_agent.rb +40 -0
  39. data/lib/scruber/helpers/fetcher_agent_adapters/abstract_adapter.rb +69 -0
  40. data/lib/scruber/helpers/fetcher_agent_adapters/memory.rb +41 -0
  41. data/lib/scruber/helpers/proxy_rotator.rb +125 -0
  42. data/lib/scruber/helpers/user_agent_rotator.rb +91 -0
  43. data/lib/scruber/queue.rb +34 -0
  44. data/lib/scruber/queue_adapters/abstract_adapter.rb +112 -0
  45. data/lib/scruber/queue_adapters/memory.rb +70 -0
  46. data/lib/scruber/version.rb +3 -0
  47. data/lib/scruber.rb +69 -0
  48. data/scruber.gemspec +43 -0
  49. metadata +233 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6aecb299dd5c70bfe864f65d2faed704cdc3cde0
4
+ data.tar.gz: f0697ca03b0f1552e03fb1ee70d0265d5b8bed04
5
+ SHA512:
6
+ metadata.gz: 2f52170ba8af4152b88e694537fa9e55156fa6d082c1e9ec409ae46808dc9ca28ff47ed241e8072c0edbba13b3e0be9f00504aef4542da52c3a280c282d8df71
7
+ data.tar.gz: 580d4c3840526dba3e62826433f8dddd09cdc42f569a2db05beb69b3428bfbd0fea92dc2faa0367175870327605f27f960e04be3d65d5c2dbd3c44c61719252d
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ .ruby-version
11
+ .ruby-gemset
12
+ todo
13
+
14
+ # rspec failure tracking
15
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.1
5
+ before_install: gem install bundler -v 1.15.3
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in scruber.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 Ivan Goncharov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # Scruber
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/scruber`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'scruber'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install scruber
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/scruber.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "scruber"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/exe/scruber ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ require "scruber/cli"
3
+
4
+ Scruber::CLI::Root.start(ARGV)
@@ -0,0 +1,31 @@
1
require "pathname"

module Scruber
  # Finds a project's own `bin/scruber` launcher by walking upwards from the
  # current working directory and, when one is found, replaces the current
  # process with it.
  module AppSearcher

    extend self

    # Ruby interpreter used to run the project's launcher.
    RUBY = Gem.ruby
    # Launcher paths probed relative to the directory being inspected.
    EXECUTABLES = ["bin/scruber"]

    #
    # Search the current directory and each of its ancestors for a launcher
    # and exec it with the original command-line arguments (ARGV).
    #
    # When the filesystem root is reached without finding a launcher the
    # original working directory is restored and the method returns nil.
    #
    # @param name [String] scraper name supplied by the caller; not used by
    #   the search itself because the launcher receives ARGV unchanged
    # @return [nil] only when no launcher was found
    def exec_app(name)
      original_cwd = Dir.pwd

      loop do
        if (exe = find_executable)
          exec RUBY, exe, *ARGV
          break # unreachable after a real exec; lets the test suite stub exec
        end

        # If we exhaust the search there is no executable, this could be a
        # call to generate a new application, so restore the original cwd.
        if Pathname.new(Dir.pwd).root?
          Dir.chdir(original_cwd)
          return
        end

        # Otherwise keep moving upwards in search of an executable.
        Dir.chdir("..")
      end
    end

    #
    # Look for a launcher in the current working directory.
    #
    # @return [String, nil] relative path of the first existing launcher,
    #   or nil when none exists here
    def find_executable
      EXECUTABLES.find { |exe| File.file?(exe) }
    end
  end
end
@@ -0,0 +1,47 @@
1
require "thor"
require 'thor/group'
require 'fileutils'

module Scruber
  module CLI
    # Generator registered as the `new PATH` command: creates the project
    # directory, renders the bundled templates into it, runs `bundle` inside
    # it and prints a hint on how to start the sample scraper.
    #
    # NOTE(review): the steps rely on Thor::Group invoking the public
    # instance methods in definition order, so no extra public methods are
    # added here — confirm against the Thor documentation.
    class ProjectGenerator < Thor::Group
      include Thor::Actions

      # Template file (relative to source_root) => destination inside the
      # new project. Each template is listed exactly once.
      TEMPLATES = {
        'Gemfile.tt'                  => 'Gemfile',
        'gitignore.tt'                => '.gitignore',
        'bin/scruber.tt'              => 'bin/scruber',
        'application.tt'              => 'config/application.rb',
        'boot.tt'                     => 'config/boot.rb',
        'initializers/proxies.tt'     => 'config/initializers/proxies.rb',
        'initializers/user_agents.tt' => 'config/initializers/user_agents.rb',
        'scrapers/sample.tt'          => 'scrapers/sample.rb'
      }.freeze

      argument :path
      class_option :queue, :default => 'memory', :aliases => '-q'
      class_option :fetcher_agent, :default => 'memory', :aliases => '-fa'

      # @return [String] directory that holds the generator templates
      def self.source_root
        File.join(File.dirname(__FILE__), 'templates')
      end

      # Create the project root; refuses to touch an existing path.
      #
      # @raise [Thor::Error] when +path+ already exists
      def create_directories
        raise ::Thor::Error, "ERROR: #{path} already exists." if File.exist?(path)
        say "Creating scruber project at #{path}"
        FileUtils.mkdir_p(path)
      end

      # Render every entry of TEMPLATES into the new project.
      def create_files
        TEMPLATES.each do |source, destination|
          template source, File.join(path, destination)
        end
      end

      # Install the generated project's dependencies.
      def init_project
        inside path do
          run "bundle"
        end
      end

      # Tell the user how to run the generated sample scraper.
      def print_instructions
        say "Run `scruber start sample` to run sample scraper"
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
source "https://rubygems.org"

gem 'scruber'
<% if options[:queue] == 'mongo' %>
# Added because the project was generated with the `mongo` queue option.
gem 'scruber-mongo'
<% end %>
@@ -0,0 +1,18 @@
1
+ require File.expand_path('../boot', __FILE__)
2
+
3
+ Bundler.require(:default)
4
+
5
+ Scruber.configure do |config|
6
+ config.fetcher_adapter = :typhoeus_fetcher
7
+ config.fetcher_options = {
8
+ max_concurrency: 1,
9
+ max_retry_times: 5,
10
+ retry_delays: [1,2,2,4,4],
11
+ followlocation: false,
12
+ request_timeout: 15,
13
+ }
14
+ config.fetcher_agent_adapter = :<%= options[:fetcher_agent] %>
15
+ config.fetcher_agent_options = {}
16
+ config.queue_adapter = :<%= options[:queue] %>
17
+ config.queue_options = {}
18
+ end
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+ APP_PATH = File.expand_path('../config/application', __dir__)
3
+ require "scruber/cli"
4
+ require_relative '../config/application'
5
+
6
+ Scruber::CLI::Root.start(ARGV)
@@ -0,0 +1,3 @@
1
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../Gemfile', __FILE__)
2
+
3
+ require 'bundler/setup' # Set up gems listed in the Gemfile.
@@ -0,0 +1,12 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+
11
+ # rspec failure tracking
12
+ .rspec_status
@@ -0,0 +1,10 @@
1
+ # Scruber::Core::Extensions::Loop.add_dictionary(:proxy_list, File.expand_path(File.dirname(__FILE__))+'/proxies.xml', :xml)
2
+
3
+ # Scruber::Helpers::ProxyRotator.configure do
4
+ # clean
5
+ # set_mode :round_robin
6
+
7
+ # loop :proxy_list do |ua|
8
+ # add ua['name'], tags: ua['tags'].split(',').map(&:strip)
9
+ # end
10
+ # end
@@ -0,0 +1,14 @@
1
+ # Scruber::Core::Extensions::Loop.add_dictionary(:user_agents, File.expand_path(File.dirname(__FILE__))+'/user_agents.xml', :xml)
2
+
3
+ Scruber::Helpers::UserAgentRotator.configure do
4
+ clean
5
+ set_filter :all
6
+
7
+ add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36", tags: [:desktop, :modern, :chrome]
8
+
9
+ # How to access user_agents dictionary
10
+ #
11
+ # loop :user_agents do |ua|
12
+ # add ua['name'], tags: ua['tags'].split(',').map(&:strip)
13
+ # end
14
+ end
@@ -0,0 +1,7 @@
1
+ Scruber.run do
2
+ queue.add "http://example.com"
3
+
4
+ parser :seed do |page|
5
+ puts page.response_body
6
+ end
7
+ end
@@ -0,0 +1,40 @@
1
require "thor"
require "scruber"
require "scruber/cli/project_generator"
require "scruber/app_searcher"

module Scruber
  module CLI

    # Top-level Thor command set of the `scruber` executable.
    class Root < Thor
      # Make Thor exit with a failure status when a command raises Thor::Error.
      def self.exit_on_failure?
        true
      end

      register(ProjectGenerator, "new", "new PATH", "Create new project")

      desc 'start NAME', 'Run scraper'
      #
      # Run the scraper stored in `scrapers/NAME.rb` of the current project.
      #
      # When APP_PATH is defined the project configuration, its initializers
      # (in sorted order) and finally the scraper file are required.
      # Otherwise the call is handed to AppSearcher, which looks for a
      # project launcher in the current directory and its ancestors.
      #
      # @param name [String] scraper file name without the `.rb` extension
      # @raise [Thor::Error] when the scraper file does not exist
      def start(name)
        return Scruber::AppSearcher.exec_app(name) unless defined?(APP_PATH)

        scraper_path = File.expand_path('../../scrapers/'+name+'.rb', APP_PATH)
        raise ::Thor::Error, "ERROR: Scraper not found." unless File.exist?(scraper_path)

        say "booting..."
        require APP_PATH
        Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |initializer|
          require initializer
        end

        say "starting #{name}"
        require scraper_path
      end

      desc 'version', 'Display version'
      map %w[-v --version] => :version
      # Print the gem version.
      def version
        say "Scruber #{VERSION}"
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module Scruber
  module Core
    # Value holder for the adapter names and adapter option hashes used by
    # the fetcher, the fetcher agent and the queue.
    class Configuration
      attr_accessor :fetcher_adapter, :fetcher_options,
                    :fetcher_agent_adapter, :fetcher_agent_options,
                    :queue_adapter, :queue_options

      # Start from the built-in defaults: typhoeus fetcher, in-memory fetcher
      # agent and queue, and empty option hashes.
      def initialize
        @fetcher_adapter, @fetcher_options = :typhoeus_fetcher, {}
        @fetcher_agent_adapter, @fetcher_agent_options = :memory, {}
        @queue_adapter, @queue_options = :memory, {}
      end

      #
      # Apply overrides on top of the current settings.
      #
      # An adapter is replaced only when its key is present in +options+;
      # option hashes are merged into the existing ones in place.
      #
      # @param options [Hash] any of :fetcher_adapter, :fetcher_options,
      #   :fetcher_agent_adapter, :fetcher_agent_options, :queue_adapter,
      #   :queue_options
      # @return [Hash] the merged queue options
      def merge_options(options)
        @fetcher_adapter = options[:fetcher_adapter] if options.key?(:fetcher_adapter)
        extra_fetcher_options = options.key?(:fetcher_options) ? options[:fetcher_options] : {}
        @fetcher_options.merge!(extra_fetcher_options)

        @fetcher_agent_adapter = options[:fetcher_agent_adapter] if options.key?(:fetcher_agent_adapter)
        extra_agent_options = options.key?(:fetcher_agent_options) ? options[:fetcher_agent_options] : {}
        @fetcher_agent_options.merge!(extra_agent_options)

        @queue_adapter = options[:queue_adapter] if options.key?(:queue_adapter)
        extra_queue_options = options.key?(:queue_options) ? options[:queue_options] : {}
        @queue_options.merge!(extra_queue_options)
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
module Scruber
  module Core
    # Drives a scraping session: owns the queue and the fetcher, stores the
    # parser callbacks registered per page type and runs them on every
    # downloaded page. Extensions add DSL methods either by being included
    # (see load_extensions) or through pattern-based dynamic methods
    # registered with Crawler.register_method_missing.
    class Crawler
      attr_reader :queue, :fetcher

      #
      # @param options [Hash] overrides merged into Scruber.configuration
      def initialize(options={})
        Scruber.configuration.merge_options(options)
        @callbacks_options = {}
        @callbacks = {}
        @on_complete_callbacks = {}
        @queue = Scruber::Queue.new
        @fetcher = Scruber::Fetcher.new
        load_extensions
      end

      #
      # Run crawling.
      #
      # The block is evaluated in the context of the crawler (so it can call
      # +queue+, +parser+ and extension methods). Afterwards the fetcher is
      # run while the queue reports work, and every downloaded page whose
      # type has a registered parser is passed to it together with the
      # result of processing the page with the parser's page format. Pages
      # without a parser are skipped. Finally all on-complete callbacks run.
      #
      # @param block [Proc] crawler body
      def run(&block)
        instance_eval(&block)
        while @queue.has_work?
          @fetcher.run @queue
          while (page = @queue.fetch_downloaded)
            page_type = page.page_type.to_sym
            callback = @callbacks[page_type]
            next unless callback

            instance_exec page, process_page(page, page_type), &callback
          end
        end
        @on_complete_callbacks.each_value do |callback|
          instance_exec(&callback)
        end
      end

      #
      # Register the parser for a page type.
      #
      # @param page_type [Symbol, String] page type the block handles
      # @param options [Hash] parser options; :page_format selects the
      #   format used to pre-process the page
      # @param block [Proc] receives the page and the processed page
      def parser(page_type, options={}, &block)
        register_callback(page_type, options, &block)
      end

      #
      # Dispatch calls to dynamic methods registered through
      # Crawler.register_method_missing. The handler is executed in the
      # context of the crawler with the method name and the argument list,
      # and its result is returned. Unknown methods fall through to the
      # default behaviour (NoMethodError).
      def method_missing(method_sym, *arguments, &block)
        handler = dynamic_handler_for(method_sym)
        return super unless handler

        instance_exec method_sym, arguments, &handler
      end

      #
      # Report dynamic methods through the standard hook so that both
      # +respond_to?+ and +method+ know about them.
      def respond_to_missing?(method_sym, include_private = false)
        !dynamic_handler_for(method_sym).nil? || super
      end

      class << self
        #
        # Register a handler for every missing method whose name matches
        # the pattern.
        #
        # @param pattern [Regexp] matched against the method name
        # @param block [Proc] receives the method name and argument list
        def register_method_missing(pattern, &block)
          _registered_method_missings[pattern] = block
        end

        # @return [Hash{Regexp => Proc}] registered dynamic method handlers
        def _registered_method_missings
          @registered_method_missings ||= {}
        end
      end

      private

      # Store the parser block and its options under the page type.
      def register_callback(page_type, options, &block)
        @callbacks_options[page_type.to_sym] = options || {}
        @callbacks[page_type.to_sym] = block
      end

      # Register a block to run once crawling has finished; registering the
      # same name again replaces the previous block.
      def on_complete_callback(name, &block)
        @on_complete_callbacks[name] = block
      end

      # Pre-process the page with the format configured for its parser
      # (nil when the parser did not specify one).
      def process_page(page, page_type)
        page_format = @callbacks_options[page_type].fetch(:page_format){ nil }
        Scruber::Core::PageFormat.process(page, page_format)
      end

      # Let every known extension register itself with the crawler.
      # Tolerates a nil descendants list (no extension loaded).
      def load_extensions
        (Scruber::Core::Extensions::Base.descendants || []).each(&:register)
      end

      # Find the handler whose pattern matches the method name, or nil.
      def dynamic_handler_for(method_sym)
        name = method_sym.to_s
        _pattern, handler = Scruber::Core::Crawler._registered_method_missings.find do |pattern, _|
          name =~ pattern
        end
        handler
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Scruber
  module Core
    module Extensions
      # Base class for crawler extensions. A subclass puts its DSL methods
      # into a nested CoreMethods module; registering the extension includes
      # that module into Scruber::Core::Crawler.
      class Base
        # Default (empty) set of methods contributed to the crawler.
        module CoreMethods

        end

        class << self
          # Include this extension's CoreMethods into the crawler class.
          def register
            Scruber::Core::Crawler.include const_get(:CoreMethods)
          end

          # Record each direct subclass of the receiver so it can be
          # registered later.
          def inherited(subclass)
            super
            descendants << subclass
          end

          #
          # @return [Array<Class>] subclasses recorded for the receiver;
          #   empty (never nil) when nothing has subclassed it yet
          def descendants
            @descendants ||= []
          end
        end
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
require 'csv'

module Scruber
  module Core
    module Extensions
      # Crawler extension for writing rows to CSV files. Files are opened by
      # +csv_file+ (or the dynamic +csv_<id>_file+ form), rows are appended
      # with +csv_out+ / +csv_<id>_out+, and every open file is closed by an
      # on-complete callback of the crawler.
      class CsvOutput < Base
        # Name pattern of the dynamic `csv_<id>_file` methods; the capture
        # group is the file id.
        FILE_METHOD_PATTERN = /\Acsv_(\w+)_file\Z/

        module CoreMethods
          #
          # Open a CSV file for output and schedule it to be closed when
          # crawling completes.
          #
          # @param path [String] destination file
          # @param options [Hash] :file_id names the file (default :default);
          #   the remaining entries are passed on to register_csv. The hash
          #   given by the caller is not modified.
          def csv_file(path, options={})
            file_id = options.fetch(:file_id) { :default }.to_sym
            csv_options = options.reject { |key, _| key == :file_id }
            Scruber::Core::Extensions::CsvOutput.register_csv file_id, path, csv_options
            on_complete_callback :close_csv_files do
              Scruber::Core::Extensions::CsvOutput.close_all
            end
          end

          #
          # Append a row to the :default file.
          #
          # @param fields [Array] row values
          def csv_out(fields)
            Scruber::Core::Extensions::CsvOutput.csv_out :default, fields
          end

          # When mixed in, teach the crawler the dynamic `csv_<id>_file`
          # methods: calling one opens the file under <id> and defines the
          # matching `csv_<id>_out` writer on the crawler class.
          def self.included(base)
            pattern = Scruber::Core::Extensions::CsvOutput::FILE_METHOD_PATTERN
            Scruber::Core::Crawler.register_method_missing pattern do |meth, args|
              file_id = meth.to_s[pattern, 1].to_sym
              path, options = args
              csv_file path, (options || {}).merge(file_id: file_id)
              Scruber::Core::Crawler.class_eval do
                define_method "csv_#{file_id}_out".to_sym do |fields|
                  Scruber::Core::Extensions::CsvOutput.csv_out(file_id, fields)
                end
              end
            end
          end
        end

        class << self
          #
          # Append a row to a registered file.
          #
          # @param file_id [Symbol, String] id given at registration
          # @param fields [Array] row values
          # @raise [ArgumentError] when no file is registered under file_id
          def csv_out(file_id, fields)
            key = file_id.to_sym
            unless _registered_files.key?(key)
              raise ArgumentError, "file #{file_id.inspect} not registered"
            end

            _registered_files[key] << fields
          end

          #
          # Open the file and remember the handle under file_id.
          #
          # @param file_id [Symbol] registry key
          # @param path [String] destination file
          # @param options [Hash] :mode is the open mode (default 'wb'); all
          #   other entries are given to CSV.open as keyword options. The
          #   hash given by the caller is not modified.
          def register_csv(file_id, path, options)
            mode = options.fetch(:mode){ 'wb' }
            csv_options = options.reject { |key, _| key == :mode }
            # Splat the options: CSV.open takes them as keywords on Ruby 3.
            _registered_files[file_id] = CSV.open(path, mode, **csv_options)
          end

          # @return [Hash] open CSV handles keyed by file id
          def _registered_files
            @registered_files ||= {}
          end

          # Close every registered file and forget the handles.
          def close_all
            _registered_files.each_value(&:close)
            @registered_files = {}
          end
        end

      end
    end
  end
end
@@ -0,0 +1,39 @@
1
module Scruber
  module Core
    module Extensions
      # Crawler extension that iterates over the records of a dictionary
      # file previously registered with Loop.add_dictionary.
      class Loop < Base
        module CoreMethods
          #
          # Iterate over a registered dictionary, running the block in the
          # context of the receiver for every record.
          #
          # @param dictionary [Symbol, String] registered dictionary name
          # @param options [Hash] passed through to the dictionary reader
          def loop(dictionary, options={}, &block)
            Scruber::Core::Extensions::Loop.loop(dictionary, options) do |*record|
              instance_exec(*record, &block)
            end
          end
        end

        class << self
          #
          # Read the dictionary's file with DictionaryReader and yield each
          # object it produces.
          #
          # @raise [ArgumentError] when the dictionary is not registered
          def loop(dictionary, options={})
            registry = _registered_dictionaries
            key = dictionary.to_sym
            unless registry.key?(key)
              raise ArgumentError, "dictionary not registered, available dictionaries #{registry.keys.inspect}"
            end

            source = registry[key]
            Scruber::Helpers::DictionaryReader.read(source[:file_path], source[:file_type], options) do |obj|
              yield obj
            end
          end

          #
          # Register a dictionary file under a name.
          #
          # @param name [Symbol, String] dictionary name
          # @param file_path [String] location of the file
          # @param file_type [Symbol] reader type handed to DictionaryReader
          def add_dictionary(name, file_path, file_type)
            _registered_dictionaries[name.to_sym] = { file_path: file_path, file_type: file_type }
          end

          # @return [Hash] registered dictionaries keyed by name
          def _registered_dictionaries
            @registered_dictionaries ||= {}
          end
        end

      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Scruber
  module Core
    module PageFormat
      # Abstract page format: concrete formats override +process+.
      class Base
        class << self
          #
          # Turn a page into its processed representation.
          #
          # @param page [Object] the page to process
          # @raise [NotImplementedError] always, in this abstract class
          def process(page)
            raise NotImplementedError
          end
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Scruber
  module Core
    module PageFormat
      # Page format registered as :html.
      class HTML < Base
        class << self
          #
          # Build a Nokogiri HTML document from the page's response body.
          #
          # @param page [Object] object responding to +response_body+
          # @return [Object, nil] the parsed document, or nil when building
          #   it raised a StandardError (best effort)
          def process(page)
            Nokogiri::HTML(page.response_body)
          rescue StandardError
            nil
          end
        end
      end
    end
  end
end

Scruber::Core::PageFormat.add(:html, Scruber::Core::PageFormat::HTML)
@@ -0,0 +1,13 @@
1
module Scruber
  module Core
    module PageFormat
      # Page format registered as :xml.
      class XML < Base
        class << self
          #
          # Parse the page's response body with Nokogiri.parse.
          #
          # @param page [Object] object responding to +response_body+
          # @return [Object, nil] the parsed document, or nil when parsing
          #   raised a StandardError (best effort)
          def process(page)
            Nokogiri.parse(page.response_body)
          rescue StandardError
            nil
          end
        end
      end
    end
  end
end

Scruber::Core::PageFormat.add(:xml, Scruber::Core::PageFormat::XML)