scruber 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/lib/scruber/app_searcher.rb +7 -0
- data/lib/scruber/cli/templates/scrapers/sample.tt +1 -1
- data/lib/scruber/cli.rb +3 -2
- data/lib/scruber/core/crawler.rb +6 -5
- data/lib/scruber/core/extensions/csv_output.rb +2 -2
- data/lib/scruber/queue_adapters/abstract_adapter.rb +6 -0
- data/lib/scruber/queue_adapters/memory.rb +11 -1
- data/lib/scruber/version.rb +1 -1
- data/lib/scruber.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8cc15677a33ddfea6a5a02bc0051d16cffea104f
|
4
|
+
data.tar.gz: 858c94d1eab0bd11688f719f9a012ef93bf198e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 410eb7d1f17efb474c9bbe5d39df02df4f52ef21994aebd8f4678ad86ef867da8a532cc1c96dc7803a531ea89282fcc5c59055323de74a6ea5f2719f6fe32154
|
7
|
+
data.tar.gz: 4e3f22d81e1467012a7f60d124816b5a0a7cd2e76fe5fe48aa477ccace7bf2c6d5a2079f5a6ddcde2b8fd061c0cf7bf6225cb9cf0b1c49d8a86e2d4a51d9dbdb
|
data/.gitignore
CHANGED
data/lib/scruber/app_searcher.rb
CHANGED
@@ -24,6 +24,13 @@ module Scruber
|
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
+
def find_scraper(name, app_path)
|
28
|
+
[
|
29
|
+
File.expand_path('../../scrapers/'+name+'.rb', app_path),
|
30
|
+
File.expand_path('../../scrapers/'+name, app_path),
|
31
|
+
].find{|f| File.exists?(f) }
|
32
|
+
end
|
33
|
+
|
27
34
|
def find_executable
|
28
35
|
EXECUTABLES.find { |exe| File.file?(exe) }
|
29
36
|
end
|
data/lib/scruber/cli.rb
CHANGED
@@ -17,14 +17,15 @@ module Scruber
|
|
17
17
|
def start(name)
|
18
18
|
if defined?(APP_PATH)
|
19
19
|
# raise ::Thor::Error, "ERROR: Scruber project not found." unless File.exist?(File.expand_path('config/application', Dir.pwd))
|
20
|
-
|
20
|
+
scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
|
21
|
+
raise ::Thor::Error, "ERROR: Scraper not found." if scraper_path.nil?
|
21
22
|
say "booting..."
|
22
23
|
require APP_PATH
|
23
24
|
Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |i|
|
24
25
|
require i
|
25
26
|
end
|
26
27
|
say "starting #{name}"
|
27
|
-
require
|
28
|
+
require scraper_path
|
28
29
|
else
|
29
30
|
Scruber::AppSearcher.exec_app(name)
|
30
31
|
end
|
data/lib/scruber/core/crawler.rb
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
3
|
class Crawler
|
4
|
-
attr_reader :queue, :fetcher
|
4
|
+
attr_reader :queue, :fetcher, :scraper_name
|
5
5
|
|
6
|
-
def initialize(options={})
|
6
|
+
def initialize(scraper_name, options={})
|
7
|
+
@scraper_name = scraper_name
|
7
8
|
Scruber.configuration.merge_options(options)
|
8
9
|
@callbacks_options = {}
|
9
10
|
@callbacks = {}
|
10
11
|
@on_complete_callbacks = {}
|
11
|
-
@queue = Scruber::Queue.new
|
12
|
+
@queue = Scruber::Queue.new(scraper_name: scraper_name)
|
12
13
|
@fetcher = Scruber::Fetcher.new
|
13
14
|
load_extenstions
|
14
15
|
end
|
@@ -39,8 +40,8 @@ module Scruber
|
|
39
40
|
|
40
41
|
def method_missing(method_sym, *arguments, &block)
|
41
42
|
Scruber::Core::Crawler._registered_method_missings.find do |(pattern, func)|
|
42
|
-
if method_sym.to_s
|
43
|
-
instance_exec method_sym, arguments, &(func)
|
43
|
+
if (scan_results = method_sym.to_s.scan(pattern)).present?
|
44
|
+
instance_exec method_sym, scan_results, arguments, &(func)
|
44
45
|
true
|
45
46
|
else
|
46
47
|
false
|
data/lib/scruber/core/extensions/csv_output.rb
CHANGED
@@ -17,8 +17,8 @@ module Scruber
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def self.included(base)
|
20
|
-
Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, args|
|
21
|
-
file_id =
|
20
|
+
Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, scan_results, args|
|
21
|
+
file_id = scan_results.first.first.to_sym
|
22
22
|
path, options = args
|
23
23
|
options = {} if options.nil?
|
24
24
|
csv_file path, options.merge({file_id: file_id})
|
data/lib/scruber/queue_adapters/abstract_adapter.rb
CHANGED
@@ -28,6 +28,8 @@ module Scruber
|
|
28
28
|
def initialize(queue, url, options={})
|
29
29
|
@queue = queue
|
30
30
|
@url = url
|
31
|
+
|
32
|
+
options = options.with_indifferent_access
|
31
33
|
@method = options.fetch(:method) { :get }
|
32
34
|
@user_agent = options.fetch(:user_agent) { nil }
|
33
35
|
@post_body = options.fetch(:post_body) { nil }
|
@@ -90,6 +92,10 @@ module Scruber
|
|
90
92
|
instance_variable_get("@#{k.to_s}")
|
91
93
|
end
|
92
94
|
|
95
|
+
def delete
|
96
|
+
raise NotImplementedError
|
97
|
+
end
|
98
|
+
|
93
99
|
end
|
94
100
|
|
95
101
|
def initialize(options={})
|
data/lib/scruber/queue_adapters/memory.rb
CHANGED
@@ -7,12 +7,16 @@ module Scruber
|
|
7
7
|
def save
|
8
8
|
if self.fetched_at > 0
|
9
9
|
@queue.add_downloaded self
|
10
|
-
elsif self.retry_count >= self.max_retry_times.to_i
|
10
|
+
elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
|
11
11
|
@queue.add_error_page self
|
12
12
|
else
|
13
13
|
@queue.push self
|
14
14
|
end
|
15
15
|
end
|
16
|
+
|
17
|
+
def delete
|
18
|
+
@queue.delete self
|
19
|
+
end
|
16
20
|
end
|
17
21
|
|
18
22
|
def initialize(options={})
|
@@ -63,6 +67,12 @@ module Scruber
|
|
63
67
|
@queue.count > 0 || @downloaded_pages.count > 0
|
64
68
|
end
|
65
69
|
|
70
|
+
def delete(page)
|
71
|
+
@queue -= [page]
|
72
|
+
@downloaded_pages -= [page]
|
73
|
+
@error_pages -= [page]
|
74
|
+
end
|
75
|
+
|
66
76
|
end
|
67
77
|
end
|
68
78
|
end
|
data/lib/scruber/version.rb
CHANGED
data/lib/scruber.rb
CHANGED
@@ -52,10 +52,10 @@ module Scruber
|
|
52
52
|
class << self
|
53
53
|
attr_writer :configuration
|
54
54
|
|
55
|
-
def run(options={}, &block)
|
55
|
+
def run(scraper_name, options={}, &block)
|
56
56
|
raise "You need a block to build!" unless block_given?
|
57
57
|
|
58
|
-
Core::Crawler.new(options).run(&block)
|
58
|
+
Core::Crawler.new(scraper_name, options).run(&block)
|
59
59
|
end
|
60
60
|
|
61
61
|
def configuration
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scruber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Goncharov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-03-
|
11
|
+
date: 2018-03-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|