scruber 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +39 -0
  8. data/Rakefile +6 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/scruber +4 -0
  12. data/lib/scruber/app_searcher.rb +31 -0
  13. data/lib/scruber/cli/project_generator.rb +47 -0
  14. data/lib/scruber/cli/templates/Gemfile.tt +6 -0
  15. data/lib/scruber/cli/templates/application.tt +18 -0
  16. data/lib/scruber/cli/templates/bin/scruber.tt +6 -0
  17. data/lib/scruber/cli/templates/boot.tt +3 -0
  18. data/lib/scruber/cli/templates/gitignore.tt +12 -0
  19. data/lib/scruber/cli/templates/initializers/proxies.tt +10 -0
  20. data/lib/scruber/cli/templates/initializers/user_agents.tt +14 -0
  21. data/lib/scruber/cli/templates/scrapers/sample.tt +7 -0
  22. data/lib/scruber/cli.rb +40 -0
  23. data/lib/scruber/core/configuration.rb +30 -0
  24. data/lib/scruber/core/crawler.rb +92 -0
  25. data/lib/scruber/core/extensions/base.rb +26 -0
  26. data/lib/scruber/core/extensions/csv_output.rb +62 -0
  27. data/lib/scruber/core/extensions/loop.rb +39 -0
  28. data/lib/scruber/core/page_format/base.rb +11 -0
  29. data/lib/scruber/core/page_format/html.rb +13 -0
  30. data/lib/scruber/core/page_format/xml.rb +13 -0
  31. data/lib/scruber/core/page_format.rb +33 -0
  32. data/lib/scruber/fetcher.rb +34 -0
  33. data/lib/scruber/fetcher_adapters/abstract_adapter.rb +119 -0
  34. data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb +78 -0
  35. data/lib/scruber/helpers/dictionary_reader/csv.rb +27 -0
  36. data/lib/scruber/helpers/dictionary_reader/xml.rb +23 -0
  37. data/lib/scruber/helpers/dictionary_reader.rb +33 -0
  38. data/lib/scruber/helpers/fetcher_agent.rb +40 -0
  39. data/lib/scruber/helpers/fetcher_agent_adapters/abstract_adapter.rb +69 -0
  40. data/lib/scruber/helpers/fetcher_agent_adapters/memory.rb +41 -0
  41. data/lib/scruber/helpers/proxy_rotator.rb +125 -0
  42. data/lib/scruber/helpers/user_agent_rotator.rb +91 -0
  43. data/lib/scruber/queue.rb +34 -0
  44. data/lib/scruber/queue_adapters/abstract_adapter.rb +112 -0
  45. data/lib/scruber/queue_adapters/memory.rb +70 -0
  46. data/lib/scruber/version.rb +3 -0
  47. data/lib/scruber.rb +69 -0
  48. data/scruber.gemspec +43 -0
  49. metadata +233 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6aecb299dd5c70bfe864f65d2faed704cdc3cde0
4
+ data.tar.gz: f0697ca03b0f1552e03fb1ee70d0265d5b8bed04
5
+ SHA512:
6
+ metadata.gz: 2f52170ba8af4152b88e694537fa9e55156fa6d082c1e9ec409ae46808dc9ca28ff47ed241e8072c0edbba13b3e0be9f00504aef4542da52c3a280c282d8df71
7
+ data.tar.gz: 580d4c3840526dba3e62826433f8dddd09cdc42f569a2db05beb69b3428bfbd0fea92dc2faa0367175870327605f27f960e04be3d65d5c2dbd3c44c61719252d
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ .ruby-version
11
+ .ruby-gemset
12
+ todo
13
+
14
+ # rspec failure tracking
15
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.1
5
+ before_install: gem install bundler -v 1.15.3
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in scruber.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 Ivan Goncharov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # Scruber
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/scruber`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'scruber'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install scruber
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/scruber.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "scruber"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/exe/scruber ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ require "scruber/cli"
3
+
4
+ Scruber::CLI::Root.start(ARGV)
@@ -0,0 +1,31 @@
require "pathname"

module Scruber
  # Finds a project's own launcher script by walking up from the current
  # working directory, and hands the process over to it when one is found.
  module AppSearcher

    extend self

    # Interpreter used to run the launcher.
    RUBY = Gem.ruby
    # Launcher paths, checked relative to each directory visited.
    EXECUTABLES = ["bin/scruber"]

    #
    # Search the current directory and each of its parents for a launcher
    # and exec it with the current ARGV.
    #
    # When the filesystem root is reached without a match, the original
    # working directory is restored and the method returns nil.
    #
    # @param name [String] scraper name; not used by the search itself
    # @return [nil]
    def exec_app(name)
      original_cwd = Dir.pwd

      loop do
        if exe = find_executable
          exec RUBY, exe, *ARGV
          break # not reachable in production; lets the test suite stub exec
        end

        # Search exhausted: there is no launcher above us. This could be a
        # call to generate a new application, so restore the original cwd.
        if Pathname.new(Dir.pwd).root?
          Dir.chdir(original_cwd)
          return
        end

        # Otherwise keep moving upwards in search of an executable.
        Dir.chdir("..")
      end
    end

    #
    # First launcher path that exists as a regular file relative to the
    # current working directory.
    #
    # @return [String, nil]
    def find_executable
      EXECUTABLES.find { |exe| File.file?(exe) }
    end
  end
end
@@ -0,0 +1,47 @@
require "thor"
require 'thor/group'
require 'fileutils'

module Scruber
  module CLI
    #
    # Generator registered as `scruber new PATH`. It creates the project
    # directory, renders the bundled templates into it, runs `bundle`
    # inside it and prints a usage hint.
    #
    # NOTE: Thor::Group runs every public instance method as a step, so
    # helpers must not be added here as public methods.
    class ProjectGenerator < Thor::Group
      include Thor::Actions

      argument :path
      class_option :queue, :default => 'memory', :aliases => '-q'
      class_option :fetcher_agent, :default => 'memory', :aliases => '-fa'

      # Directory that holds the .tt templates rendered by #create_files.
      #
      # @return [String]
      def self.source_root
        File.dirname(__FILE__) + '/templates'
      end

      # Create the project directory.
      #
      # @raise [Thor::Error] when the target path already exists
      def create_directories
        raise ::Thor::Error, "ERROR: #{path} already exists." if File.exist?(path)
        say "Creating scruber project at #{path}"
        FileUtils.mkdir_p(path)
      end

      # Render each template once into its place in the new project.
      def create_files
        template 'Gemfile.tt', File.join(path, 'Gemfile')
        template 'gitignore.tt', File.join(path, '.gitignore')
        template 'bin/scruber.tt', File.join(path, 'bin/scruber')
        template 'application.tt', File.join(path, 'config/application.rb')
        template 'boot.tt', File.join(path, 'config/boot.rb')
        template 'initializers/proxies.tt', File.join(path, 'config/initializers/proxies.rb')
        template 'initializers/user_agents.tt', File.join(path, 'config/initializers/user_agents.rb')
        template 'scrapers/sample.tt', File.join(path, 'scrapers/sample.rb')
      end

      # Run `bundle` inside the generated project.
      def init_project
        inside path do
          run "bundle"
        end
      end

      # Tell the user how to run the generated sample scraper.
      def print_instructions
        say "Run `scruber start sample` to run sample scraper"
      end
    end
  end
end
@@ -0,0 +1,6 @@
source "https://rubygems.org"

gem 'scruber'
<% if options[:queue] == 'mongo' %>
# Added because this project was generated with the mongo queue option.
gem 'scruber-mongo'
<% end %>
@@ -0,0 +1,18 @@
1
+ require File.expand_path('../boot', __FILE__)
2
+
3
+ Bundler.require(:default)
4
+
5
+ Scruber.configure do |config|
6
+ config.fetcher_adapter = :typhoeus_fetcher
7
+ config.fetcher_options = {
8
+ max_concurrency: 1,
9
+ max_retry_times: 5,
10
+ retry_delays: [1,2,2,4,4],
11
+ followlocation: false,
12
+ request_timeout: 15,
13
+ }
14
+ config.fetcher_agent_adapter = :<%= options[:fetcher_agent] %>
15
+ config.fetcher_agent_options = {}
16
+ config.queue_adapter = :<%= options[:queue] %>
17
+ config.queue_options = {}
18
+ end
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+ APP_PATH = File.expand_path('../config/application', __dir__)
3
+ require "scruber/cli"
4
+ require_relative '../config/application'
5
+
6
+ Scruber::CLI::Root.start(ARGV)
@@ -0,0 +1,3 @@
1
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../Gemfile', __FILE__)
2
+
3
+ require 'bundler/setup' # Set up gems listed in the Gemfile.
@@ -0,0 +1,12 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+
11
+ # rspec failure tracking
12
+ .rspec_status
@@ -0,0 +1,10 @@
1
+ # Scruber::Core::Extensions::Loop.add_dictionary(:proxy_list, File.expand_path(File.dirname(__FILE__))+'/proxies.xml', :xml)
2
+
3
+ # Scruber::Helpers::ProxyRotator.configure do
4
+ # clean
5
+ # set_mode :round_robin
6
+
7
+ # loop :proxy_list do |ua|
8
+ # add ua['name'], tags: ua['tags'].split(',').map(&:strip)
9
+ # end
10
+ # end
@@ -0,0 +1,14 @@
1
+ # Scruber::Core::Extensions::Loop.add_dictionary(:user_agents, File.expand_path(File.dirname(__FILE__))+'/user_agents.xml', :xml)
2
+
3
+ Scruber::Helpers::UserAgentRotator.configure do
4
+ clean
5
+ set_filter :all
6
+
7
+ add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36", tags: [:desktop, :modern, :chrome]
8
+
9
+ # How to access user_agents dictionary
10
+ #
11
+ # loop :user_agents do |ua|
12
+ # add ua['name'], tags: ua['tags'].split(',').map(&:strip)
13
+ # end
14
+ end
@@ -0,0 +1,7 @@
1
+ Scruber.run do
2
+ queue.add "http://example.com"
3
+
4
+ parser :seed do |page|
5
+ puts page.response_body
6
+ end
7
+ end
@@ -0,0 +1,40 @@
require "thor"
require "scruber"
require "scruber/cli/project_generator"
require "scruber/app_searcher"

module Scruber
  module CLI

    #
    # Top-level command set: `new`, `start` and `version`.
    class Root < Thor
      # Make failed commands exit with a non-zero status.
      def self.exit_on_failure?
        true
      end

      register(ProjectGenerator, "new", "new PATH", "Create new project")

      desc 'start NAME', 'Run scraper'
      #
      # Run the scraper stored at scrapers/NAME.rb of the current project.
      #
      # When APP_PATH is defined, the application file and every
      # config/initializers/*.rb file (sorted by path) are required, then
      # the scraper file itself. When it is not defined, the project
      # launcher is searched for in the current directory and its parents.
      #
      # @param name [String] scraper file name without the .rb extension
      # @raise [Thor::Error] when no project or no such scraper is found
      def start(name)
        unless defined?(APP_PATH)
          # exec_app replaces this process when it finds a launcher, so
          # coming back from it means no project was found.
          Scruber::AppSearcher.exec_app(name)
          raise ::Thor::Error, "ERROR: Scruber project not found."
        end

        scraper_path = File.expand_path('../../scrapers/'+name+'.rb', APP_PATH)
        raise ::Thor::Error, "ERROR: Scraper not found." unless File.exist?(scraper_path)

        say "booting..."
        require APP_PATH
        Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |initializer|
          require initializer
        end
        say "starting #{name}"
        require scraper_path
      end

      desc 'version', 'Display version'
      map %w[-v --version] => :version
      # Print the gem version.
      def version
        say "Scruber #{VERSION}"
      end
    end
  end
end
@@ -0,0 +1,30 @@
module Scruber
  module Core
    #
    # Holds the adapter names and adapter option hashes for the fetcher,
    # the fetcher agent and the queue.
    class Configuration
      attr_accessor :fetcher_adapter,
                    :fetcher_options,
                    :fetcher_agent_adapter,
                    :fetcher_agent_options,
                    :queue_adapter,
                    :queue_options

      # Start from the built-in adapters with empty option hashes.
      def initialize
        @fetcher_adapter = :typhoeus_fetcher
        @fetcher_options = {}
        @fetcher_agent_adapter = :memory
        @fetcher_agent_options = {}
        @queue_adapter = :memory
        @queue_options = {}
      end

      #
      # Merge overrides into the current configuration.
      #
      # Adapter names are replaced only when their key is present. Option
      # hashes are merged into the existing ones; a missing or nil option
      # hash leaves the current options untouched.
      #
      # @param options [Hash] any of :fetcher_adapter, :fetcher_options,
      #   :fetcher_agent_adapter, :fetcher_agent_options, :queue_adapter,
      #   :queue_options
      # @return [Hash] the queue options after merging
      def merge_options(options)
        @fetcher_adapter = options.fetch(:fetcher_adapter) { @fetcher_adapter }
        @fetcher_agent_adapter = options.fetch(:fetcher_agent_adapter) { @fetcher_agent_adapter }
        @queue_adapter = options.fetch(:queue_adapter) { @queue_adapter }

        # `|| {}` rather than fetch-with-default: an explicit nil must not
        # reach merge!, which would raise TypeError.
        @fetcher_options.merge!(options[:fetcher_options] || {})
        @fetcher_agent_options.merge!(options[:fetcher_agent_options] || {})
        @queue_options.merge!(options[:queue_options] || {})
      end
    end
  end
end
@@ -0,0 +1,92 @@
module Scruber
  module Core
    #
    # Scraper runtime. The block given to #run is evaluated on the crawler
    # instance, so it can fill the queue and declare parsers; downloaded
    # pages are then passed to the parser registered for their page type.
    class Crawler
      attr_reader :queue, :fetcher

      #
      # @param options [Hash] overrides merged into Scruber.configuration
      def initialize(options={})
        Scruber.configuration.merge_options(options)
        @callbacks_options = {}
        @callbacks = {}
        @on_complete_callbacks = {}
        @queue = Scruber::Queue.new
        @fetcher = Scruber::Fetcher.new
        load_extensions
      end

      #
      # Run crawling.
      #
      # Evaluates the block on this instance, then alternates between
      # running the fetcher and handing downloaded pages to their parsers
      # for as long as the queue reports work. Pages whose type has no
      # registered parser are skipped. On-complete callbacks run last.
      #
      # @param block [Proc] crawler body
      def run(&block)
        instance_eval(&block)
        while @queue.has_work?
          @fetcher.run @queue
          while (page = @queue.fetch_downloaded)
            page_type = page.page_type.to_sym
            callback = @callbacks[page_type]
            next unless callback

            processed_page = process_page(page, page_type)
            instance_exec(page, processed_page, &callback)
          end
        end
        @on_complete_callbacks.each_value do |callback|
          instance_exec(&callback)
        end
      end

      #
      # Register the parser for a page type.
      #
      # @param page_type [Symbol, String] page type the block handles
      # @param options [Hash] :page_format is passed to PageFormat.process
      # @param block [Proc] receives the page and its processed form
      def parser(page_type, options={}, &block)
        register_callback(page_type, options, &block)
      end

      #
      # Dispatch unknown methods to the handler registered for a matching
      # pattern (see .register_method_missing). The handler is run on this
      # instance with the method name and the argument array, and its
      # result is returned.
      def method_missing(method_sym, *arguments, &block)
        handler = registered_missing_handler(method_sym)
        return super unless handler

        instance_exec(method_sym, arguments, &handler)
      end

      # Report pattern-registered methods as supported, so respond_to? and
      # method agree with method_missing.
      def respond_to_missing?(method_sym, include_private = false)
        !registered_missing_handler(method_sym).nil? || super
      end

      class << self
        #
        # Register a handler for missing methods whose name matches pattern.
        #
        # @param pattern [Regexp] matched against the method name
        # @param block [Proc] called with the method name and argument array
        def register_method_missing(pattern, &block)
          _registered_method_missings[pattern] = block
        end

        # @return [Hash] registered handlers keyed by pattern
        def _registered_method_missings
          @registered_method_missings ||= {}
        end
      end

      private

      # Store a parser block and its options under the page type.
      def register_callback(page_type, options, &block)
        @callbacks_options[page_type.to_sym] = options || {}
        @callbacks[page_type.to_sym] = block
      end

      # Store a block to run once crawling is done. A later registration
      # under the same name replaces the earlier one.
      def on_complete_callback(name, &block)
        @on_complete_callbacks[name] = block
      end

      # Convert a page with the format configured for its page type.
      def process_page(page, page_type)
        page_format = @callbacks_options[page_type].fetch(:page_format) { nil }
        Scruber::Core::PageFormat.process(page, page_format)
      end

      # Let every known extension register itself. Array() tolerates a nil
      # descendants list.
      def load_extensions
        Array(Scruber::Core::Extensions::Base.descendants).each(&:register)
      end

      # Handler of the first registered pattern matching the method name,
      # or nil when none matches.
      def registered_missing_handler(method_sym)
        _pattern, handler = Scruber::Core::Crawler._registered_method_missings.find do |pattern, _|
          method_sym.to_s =~ pattern
        end
        handler
      end
    end
  end
end
@@ -0,0 +1,26 @@
module Scruber
  module Core
    module Extensions
      #
      # Parent class of crawler extensions. A subclass puts its crawler
      # methods in a nested CoreMethods module; .register mixes that
      # module into Scruber::Core::Crawler.
      class Base
        # Default, empty set of crawler methods.
        module CoreMethods

        end

        class << self
          # Include this extension's CoreMethods module into the crawler.
          def register
            Scruber::Core::Crawler.include self.const_get(:CoreMethods)
          end

          # Record each direct subclass as it is defined.
          # NOTE(review): a subclass of a subclass is recorded on its own
          # parent, not on Base — confirm this is intended.
          def inherited(subclass)
            super
            descendants << subclass
          end

          #
          # Subclasses recorded so far, in definition order.
          #
          # @return [Array<Class>] empty when nothing has subclassed yet
          def descendants
            @descendants ||= []
          end
        end
      end
    end
  end
end
@@ -0,0 +1,62 @@
require "csv"

module Scruber
  module Core
    module Extensions
      #
      # Crawler extension for writing rows to CSV files. Open files are
      # kept in a class-level registry keyed by a file id.
      class CsvOutput < Base
        module CoreMethods
          #
          # Open a CSV file for output and have all registered files
          # closed when crawling completes.
          #
          # @param path [String] file to open
          # @param options [Hash] :file_id (default :default) and :mode
          #   (default 'wb') are consumed here; everything else is passed
          #   to CSV.open. The caller's hash is not modified.
          def csv_file(path, options={})
            csv_options = options.dup
            file_id = csv_options.delete(:file_id) { :default }.to_sym
            Scruber::Core::Extensions::CsvOutput.register_csv file_id, path, csv_options
            on_complete_callback :close_csv_files do
              Scruber::Core::Extensions::CsvOutput.close_all
            end
          end

          # Append a row to the file registered as :default.
          #
          # @param fields [Array] row values
          def csv_out(fields)
            Scruber::Core::Extensions::CsvOutput.csv_out :default, fields
          end

          # On inclusion, teach the crawler the dynamic csv_<id>_file
          # method: it opens a file under <id> and defines a matching
          # csv_<id>_out method on the crawler class.
          def self.included(base)
            Scruber::Core::Crawler.register_method_missing(/\Acsv_(\w+)_file\Z/) do |meth, args|
              file_id = meth.to_s.scan(/\Acsv_(\w+)_file\Z/).first.first.to_sym
              path, options = args
              options = {} if options.nil?
              csv_file path, options.merge({file_id: file_id})
              Scruber::Core::Crawler.class_eval do
                define_method "csv_#{file_id}_out".to_sym do |fields|
                  Scruber::Core::Extensions::CsvOutput.csv_out(file_id, fields)
                end
              end
            end
          end
        end

        class << self
          #
          # Append a row to a registered file.
          #
          # @param file_id [Symbol, String] id used at registration
          # @param fields [Array] row values
          # @raise [ArgumentError] when no file is registered under file_id
          def csv_out(file_id, fields)
            key = file_id.to_sym
            unless _registered_files.key?(key)
              raise ArgumentError, "file #{file_id.inspect} not registered"
            end

            _registered_files[key] << fields
          end

          #
          # Open a CSV file and store it under file_id.
          #
          # @param file_id [Symbol] registry key
          # @param path [String] file to open
          # @param options [Hash] :mode (default 'wb') plus CSV options;
          #   the caller's hash is not modified
          def register_csv(file_id, path, options)
            csv_options = options.dup
            mode = csv_options.delete(:mode) { 'wb' }
            # CSV options go in as keywords: Ruby 3 no longer converts a
            # trailing positional hash into keyword arguments.
            _registered_files[file_id] = CSV.open(path, mode, **csv_options)
          end

          # @return [Hash] open CSV objects keyed by file id
          def _registered_files
            @registered_files ||= {}
          end

          # Close every registered file and empty the registry.
          def close_all
            _registered_files.each_value(&:close)
            @registered_files = {}
          end
        end

      end
    end
  end
end
@@ -0,0 +1,39 @@
module Scruber
  module Core
    module Extensions
      #
      # Crawler extension for iterating over named dictionaries, which are
      # registered as a file path plus a file type.
      class Loop < Base
        module CoreMethods
          #
          # Iterate over a registered dictionary, running the block on the
          # receiver for every entry.
          #
          # @param dictionary [Symbol, String] registered dictionary name
          # @param options [Hash] passed through to the dictionary reader
          def loop(dictionary, options={}, &block)
            Scruber::Core::Extensions::Loop.loop(dictionary, options) do |*entry|
              instance_exec(*entry, &block)
            end
          end
        end

        class << self
          #
          # Read a registered dictionary and yield each entry.
          #
          # @param dictionary [Symbol, String] registered dictionary name
          # @param options [Hash] passed through to the dictionary reader
          # @raise [ArgumentError] when the dictionary is not registered
          def loop(dictionary, options={})
            key = dictionary.to_sym
            unless _registered_dictionaries.key?(key)
              raise ArgumentError, "dictionary not registered, available dictionaries #{_registered_dictionaries.keys.inspect}"
            end

            source = _registered_dictionaries[key]
            Scruber::Helpers::DictionaryReader.read(source[:file_path], source[:file_type], options) do |obj|
              yield obj
            end
          end

          #
          # Register a dictionary under a name.
          #
          # @param name [Symbol, String] dictionary name
          # @param file_path [String] location of the dictionary file
          # @param file_type [Symbol] reader type for the file
          def add_dictionary(name, file_path, file_type)
            _registered_dictionaries[name.to_sym] = { file_path: file_path, file_type: file_type }
          end

          # @return [Hash] registered dictionaries keyed by name
          def _registered_dictionaries
            @registered_dictionaries ||= {}
          end
        end

      end
    end
  end
end
@@ -0,0 +1,11 @@
module Scruber
  module Core
    module PageFormat
      #
      # Parent class of page format processors. Subclasses override
      # .process.
      class Base
        class << self
          #
          # @param page [Object] page to convert
          # @raise [NotImplementedError] always, in this base class
          def process(page)
            raise NotImplementedError
          end
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
module Scruber
  module Core
    module PageFormat
      #
      # Page format that parses the response body with Nokogiri::HTML.
      class HTML < Base
        class << self
          #
          # @param page [Object] page responding to #response_body
          # @return [Object, nil] the parsed document, or nil when parsing
          #   raises a StandardError
          def process(page)
            Nokogiri::HTML(page.response_body)
          rescue StandardError
            nil
          end
        end
      end
    end
  end
end

Scruber::Core::PageFormat.add(:html, Scruber::Core::PageFormat::HTML)
@@ -0,0 +1,13 @@
module Scruber
  module Core
    module PageFormat
      #
      # Page format that parses the response body with Nokogiri.parse.
      class XML < Base
        class << self
          #
          # @param page [Object] page responding to #response_body
          # @return [Object, nil] the parsed document, or nil when parsing
          #   raises a StandardError
          def process(page)
            Nokogiri.parse(page.response_body)
          rescue StandardError
            nil
          end
        end
      end
    end
  end
end

Scruber::Core::PageFormat.add(:xml, Scruber::Core::PageFormat::XML)