scruber 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/lib/scruber/app_searcher.rb +7 -0
- data/lib/scruber/cli/templates/scrapers/sample.tt +1 -1
- data/lib/scruber/cli.rb +3 -2
- data/lib/scruber/core/crawler.rb +6 -5
- data/lib/scruber/core/extensions/csv_output.rb +2 -2
- data/lib/scruber/queue_adapters/abstract_adapter.rb +6 -0
- data/lib/scruber/queue_adapters/memory.rb +11 -1
- data/lib/scruber/version.rb +1 -1
- data/lib/scruber.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8cc15677a33ddfea6a5a02bc0051d16cffea104f
|
4
|
+
data.tar.gz: 858c94d1eab0bd11688f719f9a012ef93bf198e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 410eb7d1f17efb474c9bbe5d39df02df4f52ef21994aebd8f4678ad86ef867da8a532cc1c96dc7803a531ea89282fcc5c59055323de74a6ea5f2719f6fe32154
|
7
|
+
data.tar.gz: 4e3f22d81e1467012a7f60d124816b5a0a7cd2e76fe5fe48aa477ccace7bf2c6d5a2079f5a6ddcde2b8fd061c0cf7bf6225cb9cf0b1c49d8a86e2d4a51d9dbdb
|
data/.gitignore
CHANGED
data/lib/scruber/app_searcher.rb
CHANGED
@@ -24,6 +24,13 @@ module Scruber
|
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
+
def find_scraper(name, app_path)
|
28
|
+
[
|
29
|
+
File.expand_path('../../scrapers/'+name+'.rb', app_path),
|
30
|
+
File.expand_path('../../scrapers/'+name, app_path),
|
31
|
+
].find{|f| File.exists?(f) }
|
32
|
+
end
|
33
|
+
|
27
34
|
def find_executable
|
28
35
|
EXECUTABLES.find { |exe| File.file?(exe) }
|
29
36
|
end
|
data/lib/scruber/cli.rb
CHANGED
@@ -17,14 +17,15 @@ module Scruber
|
|
17
17
|
def start(name)
|
18
18
|
if defined?(APP_PATH)
|
19
19
|
# raise ::Thor::Error, "ERROR: Scruber project not found." unless File.exist?(File.expand_path('config/application', Dir.pwd))
|
20
|
-
|
20
|
+
scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
|
21
|
+
raise ::Thor::Error, "ERROR: Scraper not found." if scraper_path.nil?
|
21
22
|
say "booting..."
|
22
23
|
require APP_PATH
|
23
24
|
Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |i|
|
24
25
|
require i
|
25
26
|
end
|
26
27
|
say "starting #{name}"
|
27
|
-
require
|
28
|
+
require scraper_path
|
28
29
|
else
|
29
30
|
Scruber::AppSearcher.exec_app(name)
|
30
31
|
end
|
data/lib/scruber/core/crawler.rb
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
3
|
class Crawler
|
4
|
-
attr_reader :queue, :fetcher
|
4
|
+
attr_reader :queue, :fetcher, :scraper_name
|
5
5
|
|
6
|
-
def initialize(options={})
|
6
|
+
def initialize(scraper_name, options={})
|
7
|
+
@scraper_name = scraper_name
|
7
8
|
Scruber.configuration.merge_options(options)
|
8
9
|
@callbacks_options = {}
|
9
10
|
@callbacks = {}
|
10
11
|
@on_complete_callbacks = {}
|
11
|
-
@queue = Scruber::Queue.new
|
12
|
+
@queue = Scruber::Queue.new(scraper_name: scraper_name)
|
12
13
|
@fetcher = Scruber::Fetcher.new
|
13
14
|
load_extenstions
|
14
15
|
end
|
@@ -39,8 +40,8 @@ module Scruber
|
|
39
40
|
|
40
41
|
def method_missing(method_sym, *arguments, &block)
|
41
42
|
Scruber::Core::Crawler._registered_method_missings.find do |(pattern, func)|
|
42
|
-
if method_sym.to_s
|
43
|
-
instance_exec method_sym, arguments, &(func)
|
43
|
+
if (scan_results = method_sym.to_s.scan(pattern)).present?
|
44
|
+
instance_exec method_sym, scan_results, arguments, &(func)
|
44
45
|
true
|
45
46
|
else
|
46
47
|
false
|
data/lib/scruber/core/extensions/csv_output.rb
CHANGED
@@ -17,8 +17,8 @@ module Scruber
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def self.included(base)
|
20
|
-
Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, args|
|
21
|
-
file_id =
|
20
|
+
Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, scan_results, args|
|
21
|
+
file_id = scan_results.first.first.to_sym
|
22
22
|
path, options = args
|
23
23
|
options = {} if options.nil?
|
24
24
|
csv_file path, options.merge({file_id: file_id})
|
data/lib/scruber/queue_adapters/abstract_adapter.rb
CHANGED
@@ -28,6 +28,8 @@ module Scruber
|
|
28
28
|
def initialize(queue, url, options={})
|
29
29
|
@queue = queue
|
30
30
|
@url = url
|
31
|
+
|
32
|
+
options = options.with_indifferent_access
|
31
33
|
@method = options.fetch(:method) { :get }
|
32
34
|
@user_agent = options.fetch(:user_agent) { nil }
|
33
35
|
@post_body = options.fetch(:post_body) { nil }
|
@@ -90,6 +92,10 @@ module Scruber
|
|
90
92
|
instance_variable_get("@#{k.to_s}")
|
91
93
|
end
|
92
94
|
|
95
|
+
def delete
|
96
|
+
raise NotImplementedError
|
97
|
+
end
|
98
|
+
|
93
99
|
end
|
94
100
|
|
95
101
|
def initialize(options={})
|
data/lib/scruber/queue_adapters/memory.rb
CHANGED
@@ -7,12 +7,16 @@ module Scruber
|
|
7
7
|
def save
|
8
8
|
if self.fetched_at > 0
|
9
9
|
@queue.add_downloaded self
|
10
|
-
elsif self.retry_count >= self.max_retry_times.to_i
|
10
|
+
elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
|
11
11
|
@queue.add_error_page self
|
12
12
|
else
|
13
13
|
@queue.push self
|
14
14
|
end
|
15
15
|
end
|
16
|
+
|
17
|
+
def delete
|
18
|
+
@queue.delete self
|
19
|
+
end
|
16
20
|
end
|
17
21
|
|
18
22
|
def initialize(options={})
|
@@ -63,6 +67,12 @@ module Scruber
|
|
63
67
|
@queue.count > 0 || @downloaded_pages.count > 0
|
64
68
|
end
|
65
69
|
|
70
|
+
def delete(page)
|
71
|
+
@queue -= [page]
|
72
|
+
@downloaded_pages -= [page]
|
73
|
+
@error_pages -= [page]
|
74
|
+
end
|
75
|
+
|
66
76
|
end
|
67
77
|
end
|
68
78
|
end
|
data/lib/scruber/version.rb
CHANGED
data/lib/scruber.rb
CHANGED
@@ -52,10 +52,10 @@ module Scruber
|
|
52
52
|
class << self
|
53
53
|
attr_writer :configuration
|
54
54
|
|
55
|
-
def run(options={}, &block)
|
55
|
+
def run(scraper_name, options={}, &block)
|
56
56
|
raise "You need a block to build!" unless block_given?
|
57
57
|
|
58
|
-
Core::Crawler.new(options).run(&block)
|
58
|
+
Core::Crawler.new(scraper_name, options).run(&block)
|
59
59
|
end
|
60
60
|
|
61
61
|
def configuration
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scruber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Goncharov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-03-
|
11
|
+
date: 2018-03-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|