scruber 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6aecb299dd5c70bfe864f65d2faed704cdc3cde0
4
- data.tar.gz: f0697ca03b0f1552e03fb1ee70d0265d5b8bed04
3
+ metadata.gz: 8cc15677a33ddfea6a5a02bc0051d16cffea104f
4
+ data.tar.gz: 858c94d1eab0bd11688f719f9a012ef93bf198e8
5
5
  SHA512:
6
- metadata.gz: 2f52170ba8af4152b88e694537fa9e55156fa6d082c1e9ec409ae46808dc9ca28ff47ed241e8072c0edbba13b3e0be9f00504aef4542da52c3a280c282d8df71
7
- data.tar.gz: 580d4c3840526dba3e62826433f8dddd09cdc42f569a2db05beb69b3428bfbd0fea92dc2faa0367175870327605f27f960e04be3d65d5c2dbd3c44c61719252d
6
+ metadata.gz: 410eb7d1f17efb474c9bbe5d39df02df4f52ef21994aebd8f4678ad86ef867da8a532cc1c96dc7803a531ea89282fcc5c59055323de74a6ea5f2719f6fe32154
7
+ data.tar.gz: 4e3f22d81e1467012a7f60d124816b5a0a7cd2e76fe5fe48aa477ccace7bf2c6d5a2079f5a6ddcde2b8fd061c0cf7bf6225cb9cf0b1c49d8a86e2d4a51d9dbdb
data/.gitignore CHANGED
@@ -13,3 +13,4 @@ todo
13
13
 
14
14
  # rspec failure tracking
15
15
  .rspec_status
16
+ *.gem
@@ -24,6 +24,13 @@ module Scruber
24
24
  end
25
25
  end
26
26
 
27
+ def find_scraper(name, app_path)
28
+ [
29
+ File.expand_path('../../scrapers/'+name+'.rb', app_path),
30
+ File.expand_path('../../scrapers/'+name, app_path),
31
+ ].find{|f| File.exists?(f) }
32
+ end
33
+
27
34
  def find_executable
28
35
  EXECUTABLES.find { |exe| File.file?(exe) }
29
36
  end
@@ -1,4 +1,4 @@
1
- Scruber.run do
1
+ Scruber.run :sample do
2
2
  queue.add "http://example.com"
3
3
 
4
4
  parser :seed do |page|
data/lib/scruber/cli.rb CHANGED
@@ -17,14 +17,15 @@ module Scruber
17
17
  def start(name)
18
18
  if defined?(APP_PATH)
19
19
  # raise ::Thor::Error, "ERROR: Scruber project not found." unless File.exist?(File.expand_path('config/application', Dir.pwd))
20
- raise ::Thor::Error, "ERROR: Scraper not found." unless File.exist?(File.expand_path('../../scrapers/'+name+'.rb', APP_PATH))
20
+ scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
21
+ raise ::Thor::Error, "ERROR: Scraper not found." if scraper_path.nil?
21
22
  say "booting..."
22
23
  require APP_PATH
23
24
  Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |i|
24
25
  require i
25
26
  end
26
27
  say "starting #{name}"
27
- require File.expand_path('../../scrapers/'+name+'.rb', APP_PATH)
28
+ require scraper_path
28
29
  else
29
30
  Scruber::AppSearcher.exec_app(name)
30
31
  end
@@ -1,14 +1,15 @@
1
1
  module Scruber
2
2
  module Core
3
3
  class Crawler
4
- attr_reader :queue, :fetcher
4
+ attr_reader :queue, :fetcher, :scraper_name
5
5
 
6
- def initialize(options={})
6
+ def initialize(scraper_name, options={})
7
+ @scraper_name = scraper_name
7
8
  Scruber.configuration.merge_options(options)
8
9
  @callbacks_options = {}
9
10
  @callbacks = {}
10
11
  @on_complete_callbacks = {}
11
- @queue = Scruber::Queue.new
12
+ @queue = Scruber::Queue.new(scraper_name: scraper_name)
12
13
  @fetcher = Scruber::Fetcher.new
13
14
  load_extenstions
14
15
  end
@@ -39,8 +40,8 @@ module Scruber
39
40
 
40
41
  def method_missing(method_sym, *arguments, &block)
41
42
  Scruber::Core::Crawler._registered_method_missings.find do |(pattern, func)|
42
- if method_sym.to_s =~ pattern
43
- instance_exec method_sym, arguments, &(func)
43
+ if (scan_results = method_sym.to_s.scan(pattern)).present?
44
+ instance_exec method_sym, scan_results, arguments, &(func)
44
45
  true
45
46
  else
46
47
  false
@@ -17,8 +17,8 @@ module Scruber
17
17
  end
18
18
 
19
19
  def self.included(base)
20
- Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, args|
21
- file_id = meth.to_s.scan(/\Acsv_(\w+)_file\Z/).first.first.to_sym
20
+ Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, scan_results, args|
21
+ file_id = scan_results.first.first.to_sym
22
22
  path, options = args
23
23
  options = {} if options.nil?
24
24
  csv_file path, options.merge({file_id: file_id})
@@ -28,6 +28,8 @@ module Scruber
28
28
  def initialize(queue, url, options={})
29
29
  @queue = queue
30
30
  @url = url
31
+
32
+ options = options.with_indifferent_access
31
33
  @method = options.fetch(:method) { :get }
32
34
  @user_agent = options.fetch(:user_agent) { nil }
33
35
  @post_body = options.fetch(:post_body) { nil }
@@ -90,6 +92,10 @@ module Scruber
90
92
  instance_variable_get("@#{k.to_s}")
91
93
  end
92
94
 
95
+ def delete
96
+ raise NotImplementedError
97
+ end
98
+
93
99
  end
94
100
 
95
101
  def initialize(options={})
@@ -7,12 +7,16 @@ module Scruber
7
7
  def save
8
8
  if self.fetched_at > 0
9
9
  @queue.add_downloaded self
10
- elsif self.retry_count >= self.max_retry_times.to_i
10
+ elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
11
11
  @queue.add_error_page self
12
12
  else
13
13
  @queue.push self
14
14
  end
15
15
  end
16
+
17
+ def delete
18
+ @queue.delete self
19
+ end
16
20
  end
17
21
 
18
22
  def initialize(options={})
@@ -63,6 +67,12 @@ module Scruber
63
67
  @queue.count > 0 || @downloaded_pages.count > 0
64
68
  end
65
69
 
70
+ def delete(page)
71
+ @queue -= [page]
72
+ @downloaded_pages -= [page]
73
+ @error_pages -= [page]
74
+ end
75
+
66
76
  end
67
77
  end
68
78
  end
@@ -1,3 +1,3 @@
1
1
  module Scruber
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
data/lib/scruber.rb CHANGED
@@ -52,10 +52,10 @@ module Scruber
52
52
  class << self
53
53
  attr_writer :configuration
54
54
 
55
- def run(options={}, &block)
55
+ def run(scraper_name, options={}, &block)
56
56
  raise "You need a block to build!" unless block_given?
57
57
 
58
- Core::Crawler.new(options).run(&block)
58
+ Core::Crawler.new(scraper_name, options).run(&block)
59
59
  end
60
60
 
61
61
  def configuration
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scruber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ivan Goncharov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-03-06 00:00:00.000000000 Z
11
+ date: 2018-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: typhoeus