scruber 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8cc15677a33ddfea6a5a02bc0051d16cffea104f
4
- data.tar.gz: 858c94d1eab0bd11688f719f9a012ef93bf198e8
3
+ metadata.gz: 937243b2503853e755c95800e0d34ae5ecb939a7
4
+ data.tar.gz: 1e2472ff1d1487da8df94292daa57a27d5b52c5a
5
5
  SHA512:
6
- metadata.gz: 410eb7d1f17efb474c9bbe5d39df02df4f52ef21994aebd8f4678ad86ef867da8a532cc1c96dc7803a531ea89282fcc5c59055323de74a6ea5f2719f6fe32154
7
- data.tar.gz: 4e3f22d81e1467012a7f60d124816b5a0a7cd2e76fe5fe48aa477ccace7bf2c6d5a2079f5a6ddcde2b8fd061c0cf7bf6225cb9cf0b1c49d8a86e2d4a51d9dbdb
6
+ metadata.gz: 2a30b80a4301ca18b87e1c91217ebdfd881e13c511e643ce198e7e9f28984d271b0133b11a065b6d971690fca49b948b289147884eed7c01ee5bfc985d35cc69
7
+ data.tar.gz: 77201fa5bbde27dfac6f7205711b05a3af9f85ab0ad705c6d08af5ea6b55f2c18c37a4e80878e3e7de7280ad945d20aca79aedc179087036f7bf176ecf05f597
data/exe/scruber CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
  require "scruber/cli"
3
+ require "scruber/app_searcher"
3
4
 
4
- Scruber::CLI::Root.start(ARGV)
5
+ Scruber::CLI::Root.start(ARGV) unless Scruber::AppSearcher.exec_app
data/lib/scruber.rb CHANGED
@@ -23,6 +23,8 @@ require "scruber/core/page_format/html"
23
23
  require "scruber/core/extensions/base"
24
24
  require "scruber/core/extensions/loop"
25
25
  require "scruber/core/extensions/csv_output"
26
+ require "scruber/core/extensions/queue_aliases"
27
+ require "scruber/core/extensions/parser_aliases"
26
28
 
27
29
  # require "scruber/core/configuration"
28
30
  # require "scruber/core/configuration"
@@ -52,10 +54,10 @@ module Scruber
52
54
  class << self
53
55
  attr_writer :configuration
54
56
 
55
- def run(scraper_name, options={}, &block)
57
+ def run(*args, &block)
56
58
  raise "You need a block to build!" unless block_given?
57
59
 
58
- Core::Crawler.new(scraper_name, options).run(&block)
60
+ Core::Crawler.new(*args).run(&block)
59
61
  end
60
62
 
61
63
  def configuration
@@ -6,7 +6,7 @@ module Scruber
6
6
  RUBY = Gem.ruby
7
7
  EXECUTABLES = ["bin/scruber"]
8
8
 
9
- def exec_app(name)
9
+ def exec_app
10
10
  original_cwd = Dir.pwd
11
11
 
12
12
  loop do
@@ -22,6 +22,7 @@ module Scruber
22
22
  # Otherwise keep moving upwards in search of an executable.
23
23
  Dir.chdir("..")
24
24
  end
25
+ true
25
26
  end
26
27
 
27
28
  def find_scraper(name, app_path)
data/lib/scruber/cli.rb CHANGED
@@ -1,22 +1,23 @@
1
1
  require "thor"
2
2
  require "scruber"
3
3
  require "scruber/cli/project_generator"
4
+ require "scruber/cli/generators"
4
5
  require "scruber/app_searcher"
5
6
 
6
7
  module Scruber
7
8
  module CLI
8
9
 
9
10
  class Root < Thor
10
- def self.exit_on_failure?
11
+ def self.exit_on_failure?
11
12
  true
12
13
  end
13
14
 
14
- register(ProjectGenerator, "new", "new PATH", "Create new project")
15
+ register ProjectGenerator, "new", "new PATH", "Create new project"
16
+ register Generators, 'generate', 'generate [GENERATOR]', 'Generate something'
15
17
 
16
18
  desc 'start', 'Run scraper'
17
19
  def start(name)
18
20
  if defined?(APP_PATH)
19
- # raise ::Thor::Error, "ERROR: Scruber project not found." unless File.exist?(File.expand_path('config/application', Dir.pwd))
20
21
  scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
21
22
  raise ::Thor::Error, "ERROR: Scraper not found." if scraper_path.nil?
22
23
  say "booting..."
@@ -24,10 +25,12 @@ module Scruber
24
25
  Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |i|
25
26
  require i
26
27
  end
27
- say "starting #{name}"
28
+ ENV['SCRUBER_SCRAPER_NAME'] = File.basename(scraper_path).gsub(/\.rb\Z/, '').underscore
29
+ say "starting #{ENV['SCRUBER_SCRAPER_NAME']}"
30
+
28
31
  require scraper_path
29
32
  else
30
- Scruber::AppSearcher.exec_app(name)
33
+ raise ::Thor::Error, "ERROR: Scruber project not found."
31
34
  end
32
35
  end
33
36
 
@@ -0,0 +1,33 @@
1
+ require "thor"
2
+ require 'fileutils'
3
+
4
+ module Scruber
5
+ module CLI
6
+ class Generators < Thor
7
+
8
+ class ScraperGenerator < Thor::Group
9
+ include Thor::Actions
10
+
11
+ argument :name
12
+
13
+ def self.source_root
14
+ File.dirname(__FILE__) + '/templates'
15
+ end
16
+
17
+ def create_files
18
+ if defined?(APP_PATH)
19
+ scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
20
+ if scraper_path.present?
21
+ raise ::Thor::Error, "ERROR: Scraper already exists"
22
+ end
23
+ template 'scrapers/sample.tt', File.expand_path('../../scrapers/'+name+'.rb', APP_PATH)
24
+ else
25
+ raise ::Thor::Error, "ERROR: Scruber project not found."
26
+ end
27
+ end
28
+ end
29
+
30
+ register ScraperGenerator, 'scraper', 'scraper [NAME]', 'Generate scraper'
31
+ end
32
+ end
33
+ end
@@ -25,9 +25,9 @@ module Scruber
25
25
  template 'Gemfile.tt', path+'/Gemfile'
26
26
  template 'gitignore.tt', path+'/.gitignore'
27
27
  template 'bin/scruber.tt', path+'/bin/scruber'
28
+ chmod path+'/bin/scruber', '+x'
28
29
  template 'application.tt', path+'/config/application.rb'
29
30
  template 'boot.tt', path+'/config/boot.rb'
30
- template 'boot.tt', path+'/config/boot.rb'
31
31
  template 'initializers/proxies.tt', path+'/config/initializers/proxies.rb'
32
32
  template 'initializers/user_agents.tt', path+'/config/initializers/user_agents.rb'
33
33
  template 'scrapers/sample.tt', path+'/scrapers/sample.rb'
@@ -1,7 +1,9 @@
1
- Scruber.run :sample do
2
- queue.add "http://example.com"
1
+ Scruber.run do
2
+ get "http://..."
3
3
 
4
- parser :seed do |page|
5
- puts page.response_body
4
+ parse :html do |page, doc|
5
+ # page - queue page object
6
+ # doc - processed object, in this case Nokogiri::HTML(page.response_body) object
7
+ puts doc.at('title').text
6
8
  end
7
9
  end
@@ -3,8 +3,17 @@ module Scruber
3
3
  class Crawler
4
4
  attr_reader :queue, :fetcher, :scraper_name
5
5
 
6
- def initialize(scraper_name, options={})
7
- @scraper_name = scraper_name
6
+ def initialize(*args)
7
+ if args.first.is_a?(Hash)
8
+ scraper_name = nil
9
+ options = args.first
10
+ else
11
+ scraper_name, options = args
12
+ options ||= {}
13
+ end
14
+ @scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
15
+ raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
16
+ @scraper_name = @scraper_name.to_sym
8
17
  Scruber.configuration.merge_options(options)
9
18
  @callbacks_options = {}
10
19
  @callbacks = {}
@@ -26,6 +35,7 @@ module Scruber
26
35
  if @callbacks[page.page_type.to_sym]
27
36
  processed_page = process_page(page, page.page_type.to_sym)
28
37
  instance_exec page, processed_page, &(@callbacks[page.page_type.to_sym])
38
+ page.processed! unless page.sent_to_redownload?
29
39
  end
30
40
  end
31
41
  end
@@ -39,14 +49,12 @@ module Scruber
39
49
  end
40
50
 
41
51
  def method_missing(method_sym, *arguments, &block)
42
- Scruber::Core::Crawler._registered_method_missings.find do |(pattern, func)|
52
+ Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
43
53
  if (scan_results = method_sym.to_s.scan(pattern)).present?
44
- instance_exec method_sym, scan_results, arguments, &(func)
45
- true
46
- else
47
- false
54
+ return instance_exec(method_sym, scan_results, arguments+[block], &(func))
48
55
  end
49
- end || super
56
+ end
57
+ super
50
58
  end
51
59
 
52
60
  def respond_to?(method_sym, include_private = false)
@@ -81,7 +89,7 @@ module Scruber
81
89
  end
82
90
 
83
91
  def process_page(page, page_type)
84
- page_format = @callbacks_options[page_type].fetch(:page_format){ nil }
92
+ page_format = @callbacks_options[page_type].fetch(:format){ nil }
85
93
  Scruber::Core::PageFormat.process(page, page_format)
86
94
  end
87
95
 
@@ -0,0 +1,24 @@
1
+ module Scruber
2
+ module Core
3
+ module Extensions
4
+ class ParserAliases < Base
5
+ module CoreMethods
6
+ def parse(*args, &block)
7
+ page_format = args.shift
8
+ parser('seed', {format: page_format}, &block)
9
+ end
10
+
11
+ def self.included(base)
12
+ Scruber::Core::Crawler.register_method_missing /\Aparse_(\w+)\Z/ do |meth, scan_results, args|
13
+ page_type = scan_results.first.first
14
+ page_format = args.first.is_a?(Symbol) ? args.shift : nil
15
+ block = args.shift
16
+ parser(page_type, {format: page_format}, &block)
17
+ end
18
+ end
19
+ end
20
+
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,30 @@
1
+ module Scruber
2
+ module Core
3
+ module Extensions
4
+ class QueueAliases < Base
5
+ module CoreMethods
6
+ %w(get post head).each do |meth|
7
+ define_method meth.to_sym do |url, options={}|
8
+ queue.add url, options.merge({method: meth.to_sym})
9
+ end
10
+ end
11
+
12
+ def self.included(base)
13
+ Scruber::Core::Crawler.register_method_missing /\A(get|post|head)_(\w+)\Z/ do |m, scan_results, args|
14
+ meth, page_type = scan_results.first
15
+ url, options = args
16
+ options = {} if options.nil?
17
+ Scruber::Core::Crawler.class_eval do
18
+ define_method "#{meth}_#{page_type}".to_sym do |url, options={}|
19
+ queue.add url, options.merge({method: meth.to_sym, page_type: page_type})
20
+ end
21
+ end
22
+ queue.add url, options.merge({method: meth.to_sym, page_type: page_type})
23
+ end
24
+ end
25
+ end
26
+
27
+ end
28
+ end
29
+ end
30
+ end
@@ -3,7 +3,7 @@ module Scruber
3
3
  module PageFormat
4
4
  class XML < Base
5
5
  def self.process(page)
6
- Nokogiri.parse(page.response_body) rescue nil
6
+ Nokogiri::XML(page.response_body) rescue nil
7
7
  end
8
8
  end
9
9
  end
@@ -53,6 +53,7 @@ module Scruber
53
53
 
54
54
  @_fetcher_agent = false
55
55
  @_proxy = false
56
+ @_redownload = false
56
57
  end
57
58
 
58
59
  def fetcher_agent
@@ -96,6 +97,25 @@ module Scruber
96
97
  raise NotImplementedError
97
98
  end
98
99
 
100
+ def processed!
101
+ @processed_at = Time.now.to_i
102
+ @_redownload = false
103
+ save
104
+ end
105
+
106
+ def redownload!
107
+ @_redownload = true
108
+
109
+ @processed_at = nil
110
+ @retry_count += 1
111
+ @fetched_at = 0
112
+ @response_body = nil
113
+ save
114
+ end
115
+
116
+ def sent_to_redownload?
117
+ @_redownload
118
+ end
99
119
  end
100
120
 
101
121
  def initialize(options={})
@@ -113,6 +133,10 @@ module Scruber
113
133
  def fetch_downloaded(count=nil)
114
134
  raise NotImplementedError
115
135
  end
136
+
137
+ def initialized?
138
+ raise NotImplementedError
139
+ end
116
140
  end
117
141
  end
118
142
  end
@@ -5,7 +5,9 @@ module Scruber
5
5
 
6
6
  class Page < Scruber::QueueAdapters::AbstractAdapter::Page
7
7
  def save
8
- if self.fetched_at > 0
8
+ if self.processed_at.to_i > 0
9
+ nil
10
+ elsif self.fetched_at > 0
9
11
  @queue.add_downloaded self
10
12
  elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
11
13
  @queue.add_error_page self
@@ -73,6 +75,10 @@ module Scruber
73
75
  @error_pages -= [page]
74
76
  end
75
77
 
78
+ def initialized?
79
+ @queue.present? || @downloaded_pages.present? || @error_pages.present?
80
+ end
81
+
76
82
  end
77
83
  end
78
84
  end
@@ -1,3 +1,3 @@
1
1
  module Scruber
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scruber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ivan Goncharov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-03-15 00:00:00.000000000 Z
11
+ date: 2018-03-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: typhoeus
@@ -171,6 +171,7 @@ files:
171
171
  - lib/scruber.rb
172
172
  - lib/scruber/app_searcher.rb
173
173
  - lib/scruber/cli.rb
174
+ - lib/scruber/cli/generators.rb
174
175
  - lib/scruber/cli/project_generator.rb
175
176
  - lib/scruber/cli/templates/Gemfile.tt
176
177
  - lib/scruber/cli/templates/application.tt
@@ -185,6 +186,8 @@ files:
185
186
  - lib/scruber/core/extensions/base.rb
186
187
  - lib/scruber/core/extensions/csv_output.rb
187
188
  - lib/scruber/core/extensions/loop.rb
189
+ - lib/scruber/core/extensions/parser_aliases.rb
190
+ - lib/scruber/core/extensions/queue_aliases.rb
188
191
  - lib/scruber/core/page_format.rb
189
192
  - lib/scruber/core/page_format/base.rb
190
193
  - lib/scruber/core/page_format/html.rb