scruber 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6aecb299dd5c70bfe864f65d2faed704cdc3cde0
4
- data.tar.gz: f0697ca03b0f1552e03fb1ee70d0265d5b8bed04
3
+ metadata.gz: 8cc15677a33ddfea6a5a02bc0051d16cffea104f
4
+ data.tar.gz: 858c94d1eab0bd11688f719f9a012ef93bf198e8
5
5
  SHA512:
6
- metadata.gz: 2f52170ba8af4152b88e694537fa9e55156fa6d082c1e9ec409ae46808dc9ca28ff47ed241e8072c0edbba13b3e0be9f00504aef4542da52c3a280c282d8df71
7
- data.tar.gz: 580d4c3840526dba3e62826433f8dddd09cdc42f569a2db05beb69b3428bfbd0fea92dc2faa0367175870327605f27f960e04be3d65d5c2dbd3c44c61719252d
6
+ metadata.gz: 410eb7d1f17efb474c9bbe5d39df02df4f52ef21994aebd8f4678ad86ef867da8a532cc1c96dc7803a531ea89282fcc5c59055323de74a6ea5f2719f6fe32154
7
+ data.tar.gz: 4e3f22d81e1467012a7f60d124816b5a0a7cd2e76fe5fe48aa477ccace7bf2c6d5a2079f5a6ddcde2b8fd061c0cf7bf6225cb9cf0b1c49d8a86e2d4a51d9dbdb
data/.gitignore CHANGED
@@ -13,3 +13,4 @@ todo
13
13
 
14
14
  # rspec failure tracking
15
15
  .rspec_status
16
+ *.gem
@@ -24,6 +24,13 @@ module Scruber
24
24
  end
25
25
  end
26
26
 
27
+ def find_scraper(name, app_path)
28
+ [
29
+ File.expand_path('../../scrapers/'+name+'.rb', app_path),
30
+ File.expand_path('../../scrapers/'+name, app_path),
31
+ ].find{|f| File.exists?(f) }
32
+ end
33
+
27
34
  def find_executable
28
35
  EXECUTABLES.find { |exe| File.file?(exe) }
29
36
  end
@@ -1,4 +1,4 @@
1
- Scruber.run do
1
+ Scruber.run :sample do
2
2
  queue.add "http://example.com"
3
3
 
4
4
  parser :seed do |page|
data/lib/scruber/cli.rb CHANGED
@@ -17,14 +17,15 @@ module Scruber
17
17
  def start(name)
18
18
  if defined?(APP_PATH)
19
19
  # raise ::Thor::Error, "ERROR: Scruber project not found." unless File.exist?(File.expand_path('config/application', Dir.pwd))
20
- raise ::Thor::Error, "ERROR: Scraper not found." unless File.exist?(File.expand_path('../../scrapers/'+name+'.rb', APP_PATH))
20
+ scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
21
+ raise ::Thor::Error, "ERROR: Scraper not found." if scraper_path.nil?
21
22
  say "booting..."
22
23
  require APP_PATH
23
24
  Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |i|
24
25
  require i
25
26
  end
26
27
  say "starting #{name}"
27
- require File.expand_path('../../scrapers/'+name+'.rb', APP_PATH)
28
+ require scraper_path
28
29
  else
29
30
  Scruber::AppSearcher.exec_app(name)
30
31
  end
@@ -1,14 +1,15 @@
1
1
  module Scruber
2
2
  module Core
3
3
  class Crawler
4
- attr_reader :queue, :fetcher
4
+ attr_reader :queue, :fetcher, :scraper_name
5
5
 
6
- def initialize(options={})
6
+ def initialize(scraper_name, options={})
7
+ @scraper_name = scraper_name
7
8
  Scruber.configuration.merge_options(options)
8
9
  @callbacks_options = {}
9
10
  @callbacks = {}
10
11
  @on_complete_callbacks = {}
11
- @queue = Scruber::Queue.new
12
+ @queue = Scruber::Queue.new(scraper_name: scraper_name)
12
13
  @fetcher = Scruber::Fetcher.new
13
14
  load_extenstions
14
15
  end
@@ -39,8 +40,8 @@ module Scruber
39
40
 
40
41
  def method_missing(method_sym, *arguments, &block)
41
42
  Scruber::Core::Crawler._registered_method_missings.find do |(pattern, func)|
42
- if method_sym.to_s =~ pattern
43
- instance_exec method_sym, arguments, &(func)
43
+ if (scan_results = method_sym.to_s.scan(pattern)).present?
44
+ instance_exec method_sym, scan_results, arguments, &(func)
44
45
  true
45
46
  else
46
47
  false
@@ -17,8 +17,8 @@ module Scruber
17
17
  end
18
18
 
19
19
  def self.included(base)
20
- Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, args|
21
- file_id = meth.to_s.scan(/\Acsv_(\w+)_file\Z/).first.first.to_sym
20
+ Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, scan_results, args|
21
+ file_id = scan_results.first.first.to_sym
22
22
  path, options = args
23
23
  options = {} if options.nil?
24
24
  csv_file path, options.merge({file_id: file_id})
@@ -28,6 +28,8 @@ module Scruber
28
28
  def initialize(queue, url, options={})
29
29
  @queue = queue
30
30
  @url = url
31
+
32
+ options = options.with_indifferent_access
31
33
  @method = options.fetch(:method) { :get }
32
34
  @user_agent = options.fetch(:user_agent) { nil }
33
35
  @post_body = options.fetch(:post_body) { nil }
@@ -90,6 +92,10 @@ module Scruber
90
92
  instance_variable_get("@#{k.to_s}")
91
93
  end
92
94
 
95
+ def delete
96
+ raise NotImplementedError
97
+ end
98
+
93
99
  end
94
100
 
95
101
  def initialize(options={})
@@ -7,12 +7,16 @@ module Scruber
7
7
  def save
8
8
  if self.fetched_at > 0
9
9
  @queue.add_downloaded self
10
- elsif self.retry_count >= self.max_retry_times.to_i
10
+ elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
11
11
  @queue.add_error_page self
12
12
  else
13
13
  @queue.push self
14
14
  end
15
15
  end
16
+
17
+ def delete
18
+ @queue.delete self
19
+ end
16
20
  end
17
21
 
18
22
  def initialize(options={})
@@ -63,6 +67,12 @@ module Scruber
63
67
  @queue.count > 0 || @downloaded_pages.count > 0
64
68
  end
65
69
 
70
+ def delete(page)
71
+ @queue -= [page]
72
+ @downloaded_pages -= [page]
73
+ @error_pages -= [page]
74
+ end
75
+
66
76
  end
67
77
  end
68
78
  end
@@ -1,3 +1,3 @@
1
1
  module Scruber
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
data/lib/scruber.rb CHANGED
@@ -52,10 +52,10 @@ module Scruber
52
52
  class << self
53
53
  attr_writer :configuration
54
54
 
55
- def run(options={}, &block)
55
+ def run(scraper_name, options={}, &block)
56
56
  raise "You need a block to build!" unless block_given?
57
57
 
58
- Core::Crawler.new(options).run(&block)
58
+ Core::Crawler.new(scraper_name, options).run(&block)
59
59
  end
60
60
 
61
61
  def configuration
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scruber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ivan Goncharov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-03-06 00:00:00.000000000 Z
11
+ date: 2018-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: typhoeus