scruber 0.1.2 → 0.1.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8cc15677a33ddfea6a5a02bc0051d16cffea104f
4
- data.tar.gz: 858c94d1eab0bd11688f719f9a012ef93bf198e8
3
+ metadata.gz: 937243b2503853e755c95800e0d34ae5ecb939a7
4
+ data.tar.gz: 1e2472ff1d1487da8df94292daa57a27d5b52c5a
5
5
  SHA512:
6
- metadata.gz: 410eb7d1f17efb474c9bbe5d39df02df4f52ef21994aebd8f4678ad86ef867da8a532cc1c96dc7803a531ea89282fcc5c59055323de74a6ea5f2719f6fe32154
7
- data.tar.gz: 4e3f22d81e1467012a7f60d124816b5a0a7cd2e76fe5fe48aa477ccace7bf2c6d5a2079f5a6ddcde2b8fd061c0cf7bf6225cb9cf0b1c49d8a86e2d4a51d9dbdb
6
+ metadata.gz: 2a30b80a4301ca18b87e1c91217ebdfd881e13c511e643ce198e7e9f28984d271b0133b11a065b6d971690fca49b948b289147884eed7c01ee5bfc985d35cc69
7
+ data.tar.gz: 77201fa5bbde27dfac6f7205711b05a3af9f85ab0ad705c6d08af5ea6b55f2c18c37a4e80878e3e7de7280ad945d20aca79aedc179087036f7bf176ecf05f597
data/exe/scruber CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
  require "scruber/cli"
3
+ require "scruber/app_searcher"
3
4
 
4
- Scruber::CLI::Root.start(ARGV)
5
+ Scruber::CLI::Root.start(ARGV) unless Scruber::AppSearcher.exec_app
data/lib/scruber.rb CHANGED
@@ -23,6 +23,8 @@ require "scruber/core/page_format/html"
23
23
  require "scruber/core/extensions/base"
24
24
  require "scruber/core/extensions/loop"
25
25
  require "scruber/core/extensions/csv_output"
26
+ require "scruber/core/extensions/queue_aliases"
27
+ require "scruber/core/extensions/parser_aliases"
26
28
 
27
29
  # require "scruber/core/configuration"
28
30
  # require "scruber/core/configuration"
@@ -52,10 +54,10 @@ module Scruber
52
54
  class << self
53
55
  attr_writer :configuration
54
56
 
55
- def run(scraper_name, options={}, &block)
57
+ def run(*args, &block)
56
58
  raise "You need a block to build!" unless block_given?
57
59
 
58
- Core::Crawler.new(scraper_name, options).run(&block)
60
+ Core::Crawler.new(*args).run(&block)
59
61
  end
60
62
 
61
63
  def configuration
@@ -6,7 +6,7 @@ module Scruber
6
6
  RUBY = Gem.ruby
7
7
  EXECUTABLES = ["bin/scruber"]
8
8
 
9
- def exec_app(name)
9
+ def exec_app
10
10
  original_cwd = Dir.pwd
11
11
 
12
12
  loop do
@@ -22,6 +22,7 @@ module Scruber
22
22
  # Otherwise keep moving upwards in search of an executable.
23
23
  Dir.chdir("..")
24
24
  end
25
+ true
25
26
  end
26
27
 
27
28
  def find_scraper(name, app_path)
data/lib/scruber/cli.rb CHANGED
@@ -1,22 +1,23 @@
1
1
  require "thor"
2
2
  require "scruber"
3
3
  require "scruber/cli/project_generator"
4
+ require "scruber/cli/generators"
4
5
  require "scruber/app_searcher"
5
6
 
6
7
  module Scruber
7
8
  module CLI
8
9
 
9
10
  class Root < Thor
10
- def self.exit_on_failure?
11
+ def self.exit_on_failure?
11
12
  true
12
13
  end
13
14
 
14
- register(ProjectGenerator, "new", "new PATH", "Create new project")
15
+ register ProjectGenerator, "new", "new PATH", "Create new project"
16
+ register Generators, 'generate', 'generate [GENERATOR]', 'Generate something'
15
17
 
16
18
  desc 'start', 'Run scraper'
17
19
  def start(name)
18
20
  if defined?(APP_PATH)
19
- # raise ::Thor::Error, "ERROR: Scruber project not found." unless File.exist?(File.expand_path('config/application', Dir.pwd))
20
21
  scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
21
22
  raise ::Thor::Error, "ERROR: Scraper not found." if scraper_path.nil?
22
23
  say "booting..."
@@ -24,10 +25,12 @@ module Scruber
24
25
  Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |i|
25
26
  require i
26
27
  end
27
- say "starting #{name}"
28
+ ENV['SCRUBER_SCRAPER_NAME'] = File.basename(scraper_path).gsub(/\.rb\Z/, '').underscore
29
+ say "starting #{ENV['SCRUBER_SCRAPER_NAME']}"
30
+
28
31
  require scraper_path
29
32
  else
30
- Scruber::AppSearcher.exec_app(name)
33
+ raise ::Thor::Error, "ERROR: Scruber project not found."
31
34
  end
32
35
  end
33
36
 
@@ -0,0 +1,33 @@
1
+ require "thor"
2
+ require 'fileutils'
3
+
4
+ module Scruber
5
+ module CLI
6
+ class Generators < Thor
7
+
8
+ class ScraperGenerator < Thor::Group
9
+ include Thor::Actions
10
+
11
+ argument :name
12
+
13
+ def self.source_root
14
+ File.dirname(__FILE__) + '/templates'
15
+ end
16
+
17
+ def create_files
18
+ if defined?(APP_PATH)
19
+ scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
20
+ if scraper_path.present?
21
+ raise ::Thor::Error, "ERROR: Scraper already exists"
22
+ end
23
+ template 'scrapers/sample.tt', File.expand_path('../../scrapers/'+name+'.rb', APP_PATH)
24
+ else
25
+ raise ::Thor::Error, "ERROR: Scruber project not found."
26
+ end
27
+ end
28
+ end
29
+
30
+ register ScraperGenerator, 'scraper', 'scraper [NAME]', 'Generate scraper'
31
+ end
32
+ end
33
+ end
@@ -25,9 +25,9 @@ module Scruber
25
25
  template 'Gemfile.tt', path+'/Gemfile'
26
26
  template 'gitignore.tt', path+'/.gitignore'
27
27
  template 'bin/scruber.tt', path+'/bin/scruber'
28
+ chmod path+'/bin/scruber', '+x'
28
29
  template 'application.tt', path+'/config/application.rb'
29
30
  template 'boot.tt', path+'/config/boot.rb'
30
- template 'boot.tt', path+'/config/boot.rb'
31
31
  template 'initializers/proxies.tt', path+'/config/initializers/proxies.rb'
32
32
  template 'initializers/user_agents.tt', path+'/config/initializers/user_agents.rb'
33
33
  template 'scrapers/sample.tt', path+'/scrapers/sample.rb'
@@ -1,7 +1,9 @@
1
- Scruber.run :sample do
2
- queue.add "http://example.com"
1
+ Scruber.run do
2
+ get "http://..."
3
3
 
4
- parser :seed do |page|
5
- puts page.response_body
4
+ parse :html do |page, doc|
5
+ # page - queue page object
6
+ # doc - processed object, in this case Nokogiri::HTML(page.response_body) object
7
+ puts doc.at('title').text
6
8
  end
7
9
  end
@@ -3,8 +3,17 @@ module Scruber
3
3
  class Crawler
4
4
  attr_reader :queue, :fetcher, :scraper_name
5
5
 
6
- def initialize(scraper_name, options={})
7
- @scraper_name = scraper_name
6
+ def initialize(*args)
7
+ if args.first.is_a?(Hash)
8
+ scraper_name = nil
9
+ options = args.first
10
+ else
11
+ scraper_name, options = args
12
+ options ||= {}
13
+ end
14
+ @scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
15
+ raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
16
+ @scraper_name = @scraper_name.to_sym
8
17
  Scruber.configuration.merge_options(options)
9
18
  @callbacks_options = {}
10
19
  @callbacks = {}
@@ -26,6 +35,7 @@ module Scruber
26
35
  if @callbacks[page.page_type.to_sym]
27
36
  processed_page = process_page(page, page.page_type.to_sym)
28
37
  instance_exec page, processed_page, &(@callbacks[page.page_type.to_sym])
38
+ page.processed! unless page.sent_to_redownload?
29
39
  end
30
40
  end
31
41
  end
@@ -39,14 +49,12 @@ module Scruber
39
49
  end
40
50
 
41
51
  def method_missing(method_sym, *arguments, &block)
42
- Scruber::Core::Crawler._registered_method_missings.find do |(pattern, func)|
52
+ Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
43
53
  if (scan_results = method_sym.to_s.scan(pattern)).present?
44
- instance_exec method_sym, scan_results, arguments, &(func)
45
- true
46
- else
47
- false
54
+ return instance_exec(method_sym, scan_results, arguments+[block], &(func))
48
55
  end
49
- end || super
56
+ end
57
+ super
50
58
  end
51
59
 
52
60
  def respond_to?(method_sym, include_private = false)
@@ -81,7 +89,7 @@ module Scruber
81
89
  end
82
90
 
83
91
  def process_page(page, page_type)
84
- page_format = @callbacks_options[page_type].fetch(:page_format){ nil }
92
+ page_format = @callbacks_options[page_type].fetch(:format){ nil }
85
93
  Scruber::Core::PageFormat.process(page, page_format)
86
94
  end
87
95
 
@@ -0,0 +1,24 @@
1
+ module Scruber
2
+ module Core
3
+ module Extensions
4
+ class ParserAliases < Base
5
+ module CoreMethods
6
+ def parse(*args, &block)
7
+ page_format = args.shift
8
+ parser('seed', {format: page_format}, &block)
9
+ end
10
+
11
+ def self.included(base)
12
+ Scruber::Core::Crawler.register_method_missing /\Aparse_(\w+)\Z/ do |meth, scan_results, args|
13
+ page_type = scan_results.first.first
14
+ page_format = args.first.is_a?(Symbol) ? args.shift : nil
15
+ block = args.shift
16
+ parser(page_type, {format: page_format}, &block)
17
+ end
18
+ end
19
+ end
20
+
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,30 @@
1
+ module Scruber
2
+ module Core
3
+ module Extensions
4
+ class QueueAliases < Base
5
+ module CoreMethods
6
+ %w(get post head).each do |meth|
7
+ define_method meth.to_sym do |url, options={}|
8
+ queue.add url, options.merge({method: meth.to_sym})
9
+ end
10
+ end
11
+
12
+ def self.included(base)
13
+ Scruber::Core::Crawler.register_method_missing /\A(get|post|head)_(\w+)\Z/ do |m, scan_results, args|
14
+ meth, page_type = scan_results.first
15
+ url, options = args
16
+ options = {} if options.nil?
17
+ Scruber::Core::Crawler.class_eval do
18
+ define_method "#{meth}_#{page_type}".to_sym do |url, options={}|
19
+ queue.add url, options.merge({method: meth.to_sym, page_type: page_type})
20
+ end
21
+ end
22
+ queue.add url, options.merge({method: meth.to_sym, page_type: page_type})
23
+ end
24
+ end
25
+ end
26
+
27
+ end
28
+ end
29
+ end
30
+ end
@@ -3,7 +3,7 @@ module Scruber
3
3
  module PageFormat
4
4
  class XML < Base
5
5
  def self.process(page)
6
- Nokogiri.parse(page.response_body) rescue nil
6
+ Nokogiri::XML(page.response_body) rescue nil
7
7
  end
8
8
  end
9
9
  end
@@ -53,6 +53,7 @@ module Scruber
53
53
 
54
54
  @_fetcher_agent = false
55
55
  @_proxy = false
56
+ @_redownload = false
56
57
  end
57
58
 
58
59
  def fetcher_agent
@@ -96,6 +97,25 @@ module Scruber
96
97
  raise NotImplementedError
97
98
  end
98
99
 
100
+ def processed!
101
+ @processed_at = Time.now.to_i
102
+ @_redownload = false
103
+ save
104
+ end
105
+
106
+ def redownload!
107
+ @_redownload = true
108
+
109
+ @processed_at = nil
110
+ @retry_count += 1
111
+ @fetched_at = 0
112
+ @response_body = nil
113
+ save
114
+ end
115
+
116
+ def sent_to_redownload?
117
+ @_redownload
118
+ end
99
119
  end
100
120
 
101
121
  def initialize(options={})
@@ -113,6 +133,10 @@ module Scruber
113
133
  def fetch_downloaded(count=nil)
114
134
  raise NotImplementedError
115
135
  end
136
+
137
+ def initialized?
138
+ raise NotImplementedError
139
+ end
116
140
  end
117
141
  end
118
142
  end
@@ -5,7 +5,9 @@ module Scruber
5
5
 
6
6
  class Page < Scruber::QueueAdapters::AbstractAdapter::Page
7
7
  def save
8
- if self.fetched_at > 0
8
+ if self.processed_at.to_i > 0
9
+ nil
10
+ elsif self.fetched_at > 0
9
11
  @queue.add_downloaded self
10
12
  elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
11
13
  @queue.add_error_page self
@@ -73,6 +75,10 @@ module Scruber
73
75
  @error_pages -= [page]
74
76
  end
75
77
 
78
+ def initialized?
79
+ @queue.present? || @downloaded_pages.present? || @error_pages.present?
80
+ end
81
+
76
82
  end
77
83
  end
78
84
  end
@@ -1,3 +1,3 @@
1
1
  module Scruber
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scruber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ivan Goncharov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-03-15 00:00:00.000000000 Z
11
+ date: 2018-03-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: typhoeus
@@ -171,6 +171,7 @@ files:
171
171
  - lib/scruber.rb
172
172
  - lib/scruber/app_searcher.rb
173
173
  - lib/scruber/cli.rb
174
+ - lib/scruber/cli/generators.rb
174
175
  - lib/scruber/cli/project_generator.rb
175
176
  - lib/scruber/cli/templates/Gemfile.tt
176
177
  - lib/scruber/cli/templates/application.tt
@@ -185,6 +186,8 @@ files:
185
186
  - lib/scruber/core/extensions/base.rb
186
187
  - lib/scruber/core/extensions/csv_output.rb
187
188
  - lib/scruber/core/extensions/loop.rb
189
+ - lib/scruber/core/extensions/parser_aliases.rb
190
+ - lib/scruber/core/extensions/queue_aliases.rb
188
191
  - lib/scruber/core/page_format.rb
189
192
  - lib/scruber/core/page_format/base.rb
190
193
  - lib/scruber/core/page_format/html.rb