scruber 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/scruber +2 -1
- data/lib/scruber.rb +4 -2
- data/lib/scruber/app_searcher.rb +2 -1
- data/lib/scruber/cli.rb +8 -5
- data/lib/scruber/cli/generators.rb +33 -0
- data/lib/scruber/cli/project_generator.rb +1 -1
- data/lib/scruber/cli/templates/scrapers/sample.tt +6 -4
- data/lib/scruber/core/crawler.rb +17 -9
- data/lib/scruber/core/extensions/parser_aliases.rb +24 -0
- data/lib/scruber/core/extensions/queue_aliases.rb +30 -0
- data/lib/scruber/core/page_format/xml.rb +1 -1
- data/lib/scruber/queue_adapters/abstract_adapter.rb +24 -0
- data/lib/scruber/queue_adapters/memory.rb +7 -1
- data/lib/scruber/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 937243b2503853e755c95800e0d34ae5ecb939a7
|
4
|
+
data.tar.gz: 1e2472ff1d1487da8df94292daa57a27d5b52c5a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a30b80a4301ca18b87e1c91217ebdfd881e13c511e643ce198e7e9f28984d271b0133b11a065b6d971690fca49b948b289147884eed7c01ee5bfc985d35cc69
|
7
|
+
data.tar.gz: 77201fa5bbde27dfac6f7205711b05a3af9f85ab0ad705c6d08af5ea6b55f2c18c37a4e80878e3e7de7280ad945d20aca79aedc179087036f7bf176ecf05f597
|
data/exe/scruber
CHANGED
data/lib/scruber.rb
CHANGED
@@ -23,6 +23,8 @@ require "scruber/core/page_format/html"
|
|
23
23
|
require "scruber/core/extensions/base"
|
24
24
|
require "scruber/core/extensions/loop"
|
25
25
|
require "scruber/core/extensions/csv_output"
|
26
|
+
require "scruber/core/extensions/queue_aliases"
|
27
|
+
require "scruber/core/extensions/parser_aliases"
|
26
28
|
|
27
29
|
# require "scruber/core/configuration"
|
28
30
|
# require "scruber/core/configuration"
|
@@ -52,10 +54,10 @@ module Scruber
|
|
52
54
|
class << self
|
53
55
|
attr_writer :configuration
|
54
56
|
|
55
|
-
def run(
|
57
|
+
def run(*args, &block)
|
56
58
|
raise "You need a block to build!" unless block_given?
|
57
59
|
|
58
|
-
Core::Crawler.new(
|
60
|
+
Core::Crawler.new(*args).run(&block)
|
59
61
|
end
|
60
62
|
|
61
63
|
def configuration
|
data/lib/scruber/app_searcher.rb
CHANGED
@@ -6,7 +6,7 @@ module Scruber
|
|
6
6
|
RUBY = Gem.ruby
|
7
7
|
EXECUTABLES = ["bin/scruber"]
|
8
8
|
|
9
|
-
def exec_app
|
9
|
+
def exec_app
|
10
10
|
original_cwd = Dir.pwd
|
11
11
|
|
12
12
|
loop do
|
@@ -22,6 +22,7 @@ module Scruber
|
|
22
22
|
# Otherwise keep moving upwards in search of an executable.
|
23
23
|
Dir.chdir("..")
|
24
24
|
end
|
25
|
+
true
|
25
26
|
end
|
26
27
|
|
27
28
|
def find_scraper(name, app_path)
|
data/lib/scruber/cli.rb
CHANGED
@@ -1,22 +1,23 @@
|
|
1
1
|
require "thor"
|
2
2
|
require "scruber"
|
3
3
|
require "scruber/cli/project_generator"
|
4
|
+
require "scruber/cli/generators"
|
4
5
|
require "scruber/app_searcher"
|
5
6
|
|
6
7
|
module Scruber
|
7
8
|
module CLI
|
8
9
|
|
9
10
|
class Root < Thor
|
10
|
-
|
11
|
+
def self.exit_on_failure?
|
11
12
|
true
|
12
13
|
end
|
13
14
|
|
14
|
-
register
|
15
|
+
register ProjectGenerator, "new", "new PATH", "Create new project"
|
16
|
+
register Generators, 'generate', 'generate [GENERATOR]', 'Generate something'
|
15
17
|
|
16
18
|
desc 'start', 'Run scraper'
|
17
19
|
def start(name)
|
18
20
|
if defined?(APP_PATH)
|
19
|
-
# raise ::Thor::Error, "ERROR: Scruber project not found." unless File.exist?(File.expand_path('config/application', Dir.pwd))
|
20
21
|
scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
|
21
22
|
raise ::Thor::Error, "ERROR: Scraper not found." if scraper_path.nil?
|
22
23
|
say "booting..."
|
@@ -24,10 +25,12 @@ module Scruber
|
|
24
25
|
Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |i|
|
25
26
|
require i
|
26
27
|
end
|
27
|
-
|
28
|
+
ENV['SCRUBER_SCRAPER_NAME'] = File.basename(scraper_path).gsub(/\.rb\Z/, '').underscore
|
29
|
+
say "starting #{ENV['SCRUBER_SCRAPER_NAME']}"
|
30
|
+
|
28
31
|
require scraper_path
|
29
32
|
else
|
30
|
-
Scruber
|
33
|
+
raise ::Thor::Error, "ERROR: Scruber project not found."
|
31
34
|
end
|
32
35
|
end
|
33
36
|
|
data/lib/scruber/cli/generators.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require "thor"
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module Scruber
|
5
|
+
module CLI
|
6
|
+
class Generators < Thor
|
7
|
+
|
8
|
+
class ScraperGenerator < Thor::Group
|
9
|
+
include Thor::Actions
|
10
|
+
|
11
|
+
argument :name
|
12
|
+
|
13
|
+
def self.source_root
|
14
|
+
File.dirname(__FILE__) + '/templates'
|
15
|
+
end
|
16
|
+
|
17
|
+
def create_files
|
18
|
+
if defined?(APP_PATH)
|
19
|
+
scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
|
20
|
+
if scraper_path.present?
|
21
|
+
raise ::Thor::Error, "ERROR: Scraper already exists"
|
22
|
+
end
|
23
|
+
template 'scrapers/sample.tt', File.expand_path('../../scrapers/'+name+'.rb', APP_PATH)
|
24
|
+
else
|
25
|
+
raise ::Thor::Error, "ERROR: Scruber project not found."
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
register ScraperGenerator, 'scraper', 'scraper [NAME]', 'Generate scraper'
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/scruber/cli/project_generator.rb
CHANGED
@@ -25,9 +25,9 @@ module Scruber
|
|
25
25
|
template 'Gemfile.tt', path+'/Gemfile'
|
26
26
|
template 'gitignore.tt', path+'/.gitignore'
|
27
27
|
template 'bin/scruber.tt', path+'/bin/scruber'
|
28
|
+
chmod path+'/bin/scruber', '+x'
|
28
29
|
template 'application.tt', path+'/config/application.rb'
|
29
30
|
template 'boot.tt', path+'/config/boot.rb'
|
30
|
-
template 'boot.tt', path+'/config/boot.rb'
|
31
31
|
template 'initializers/proxies.tt', path+'/config/initializers/proxies.rb'
|
32
32
|
template 'initializers/user_agents.tt', path+'/config/initializers/user_agents.rb'
|
33
33
|
template 'scrapers/sample.tt', path+'/scrapers/sample.rb'
|
data/lib/scruber/cli/templates/scrapers/sample.tt
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
-
Scruber.run
|
2
|
-
|
1
|
+
Scruber.run do
|
2
|
+
get "http://..."
|
3
3
|
|
4
|
-
|
5
|
-
|
4
|
+
parse :html do |page, doc|
|
5
|
+
# page - queue page object
|
6
|
+
# doc - processed object, in this case Nokogiri::HTML(page.response_body) object
|
7
|
+
puts doc.at('title').text
|
6
8
|
end
|
7
9
|
end
|
data/lib/scruber/core/crawler.rb
CHANGED
@@ -3,8 +3,17 @@ module Scruber
|
|
3
3
|
class Crawler
|
4
4
|
attr_reader :queue, :fetcher, :scraper_name
|
5
5
|
|
6
|
-
def initialize(
|
7
|
-
|
6
|
+
def initialize(*args)
|
7
|
+
if args.first.is_a?(Hash)
|
8
|
+
scraper_name = nil
|
9
|
+
options = args.first
|
10
|
+
else
|
11
|
+
scraper_name, options = args
|
12
|
+
options ||= {}
|
13
|
+
end
|
14
|
+
@scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
|
15
|
+
raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
|
16
|
+
@scraper_name = @scraper_name.to_sym
|
8
17
|
Scruber.configuration.merge_options(options)
|
9
18
|
@callbacks_options = {}
|
10
19
|
@callbacks = {}
|
@@ -26,6 +35,7 @@ module Scruber
|
|
26
35
|
if @callbacks[page.page_type.to_sym]
|
27
36
|
processed_page = process_page(page, page.page_type.to_sym)
|
28
37
|
instance_exec page, processed_page, &(@callbacks[page.page_type.to_sym])
|
38
|
+
page.processed! unless page.sent_to_redownload?
|
29
39
|
end
|
30
40
|
end
|
31
41
|
end
|
@@ -39,14 +49,12 @@ module Scruber
|
|
39
49
|
end
|
40
50
|
|
41
51
|
def method_missing(method_sym, *arguments, &block)
|
42
|
-
Scruber::Core::Crawler._registered_method_missings.
|
52
|
+
Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
|
43
53
|
if (scan_results = method_sym.to_s.scan(pattern)).present?
|
44
|
-
instance_exec
|
45
|
-
true
|
46
|
-
else
|
47
|
-
false
|
54
|
+
return instance_exec(method_sym, scan_results, arguments+[block], &(func))
|
48
55
|
end
|
49
|
-
end
|
56
|
+
end
|
57
|
+
super
|
50
58
|
end
|
51
59
|
|
52
60
|
def respond_to?(method_sym, include_private = false)
|
@@ -81,7 +89,7 @@ module Scruber
|
|
81
89
|
end
|
82
90
|
|
83
91
|
def process_page(page, page_type)
|
84
|
-
page_format = @callbacks_options[page_type].fetch(:
|
92
|
+
page_format = @callbacks_options[page_type].fetch(:format){ nil }
|
85
93
|
Scruber::Core::PageFormat.process(page, page_format)
|
86
94
|
end
|
87
95
|
|
data/lib/scruber/core/extensions/parser_aliases.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Scruber
|
2
|
+
module Core
|
3
|
+
module Extensions
|
4
|
+
class ParserAliases < Base
|
5
|
+
module CoreMethods
|
6
|
+
def parse(*args, &block)
|
7
|
+
page_format = args.shift
|
8
|
+
parser('seed', {format: page_format}, &block)
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.included(base)
|
12
|
+
Scruber::Core::Crawler.register_method_missing /\Aparse_(\w+)\Z/ do |meth, scan_results, args|
|
13
|
+
page_type = scan_results.first.first
|
14
|
+
page_format = args.first.is_a?(Symbol) ? args.shift : nil
|
15
|
+
block = args.shift
|
16
|
+
parser(page_type, {format: page_format}, &block)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/scruber/core/extensions/queue_aliases.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module Scruber
|
2
|
+
module Core
|
3
|
+
module Extensions
|
4
|
+
class QueueAliases < Base
|
5
|
+
module CoreMethods
|
6
|
+
%w(get post head).each do |meth|
|
7
|
+
define_method meth.to_sym do |url, options={}|
|
8
|
+
queue.add url, options.merge({method: meth.to_sym})
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.included(base)
|
13
|
+
Scruber::Core::Crawler.register_method_missing /\A(get|post|head)_(\w+)\Z/ do |m, scan_results, args|
|
14
|
+
meth, page_type = scan_results.first
|
15
|
+
url, options = args
|
16
|
+
options = {} if options.nil?
|
17
|
+
Scruber::Core::Crawler.class_eval do
|
18
|
+
define_method "#{meth}_#{page_type}".to_sym do |url, options={}|
|
19
|
+
queue.add url, options.merge({method: meth.to_sym, page_type: page_type})
|
20
|
+
end
|
21
|
+
end
|
22
|
+
queue.add url, options.merge({method: meth.to_sym, page_type: page_type})
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/scruber/queue_adapters/abstract_adapter.rb
CHANGED
@@ -53,6 +53,7 @@ module Scruber
|
|
53
53
|
|
54
54
|
@_fetcher_agent = false
|
55
55
|
@_proxy = false
|
56
|
+
@_redownload = false
|
56
57
|
end
|
57
58
|
|
58
59
|
def fetcher_agent
|
@@ -96,6 +97,25 @@ module Scruber
|
|
96
97
|
raise NotImplementedError
|
97
98
|
end
|
98
99
|
|
100
|
+
def processed!
|
101
|
+
@processed_at = Time.now.to_i
|
102
|
+
@_redownload = false
|
103
|
+
save
|
104
|
+
end
|
105
|
+
|
106
|
+
def redownload!
|
107
|
+
@_redownload = true
|
108
|
+
|
109
|
+
@processed_at = nil
|
110
|
+
@retry_count += 1
|
111
|
+
@fetched_at = 0
|
112
|
+
@response_body = nil
|
113
|
+
save
|
114
|
+
end
|
115
|
+
|
116
|
+
def sent_to_redownload?
|
117
|
+
@_redownload
|
118
|
+
end
|
99
119
|
end
|
100
120
|
|
101
121
|
def initialize(options={})
|
@@ -113,6 +133,10 @@ module Scruber
|
|
113
133
|
def fetch_downloaded(count=nil)
|
114
134
|
raise NotImplementedError
|
115
135
|
end
|
136
|
+
|
137
|
+
def initialized?
|
138
|
+
raise NotImplementedError
|
139
|
+
end
|
116
140
|
end
|
117
141
|
end
|
118
142
|
end
|
data/lib/scruber/queue_adapters/memory.rb
CHANGED
@@ -5,7 +5,9 @@ module Scruber
|
|
5
5
|
|
6
6
|
class Page < Scruber::QueueAdapters::AbstractAdapter::Page
|
7
7
|
def save
|
8
|
-
if self.
|
8
|
+
if self.processed_at.to_i > 0
|
9
|
+
nil
|
10
|
+
elsif self.fetched_at > 0
|
9
11
|
@queue.add_downloaded self
|
10
12
|
elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
|
11
13
|
@queue.add_error_page self
|
@@ -73,6 +75,10 @@ module Scruber
|
|
73
75
|
@error_pages -= [page]
|
74
76
|
end
|
75
77
|
|
78
|
+
def initialized?
|
79
|
+
@queue.present? || @downloaded_pages.present? || @error_pages.present?
|
80
|
+
end
|
81
|
+
|
76
82
|
end
|
77
83
|
end
|
78
84
|
end
|
data/lib/scruber/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scruber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Goncharov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-03-
|
11
|
+
date: 2018-03-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|
@@ -171,6 +171,7 @@ files:
|
|
171
171
|
- lib/scruber.rb
|
172
172
|
- lib/scruber/app_searcher.rb
|
173
173
|
- lib/scruber/cli.rb
|
174
|
+
- lib/scruber/cli/generators.rb
|
174
175
|
- lib/scruber/cli/project_generator.rb
|
175
176
|
- lib/scruber/cli/templates/Gemfile.tt
|
176
177
|
- lib/scruber/cli/templates/application.tt
|
@@ -185,6 +186,8 @@ files:
|
|
185
186
|
- lib/scruber/core/extensions/base.rb
|
186
187
|
- lib/scruber/core/extensions/csv_output.rb
|
187
188
|
- lib/scruber/core/extensions/loop.rb
|
189
|
+
- lib/scruber/core/extensions/parser_aliases.rb
|
190
|
+
- lib/scruber/core/extensions/queue_aliases.rb
|
188
191
|
- lib/scruber/core/page_format.rb
|
189
192
|
- lib/scruber/core/page_format/base.rb
|
190
193
|
- lib/scruber/core/page_format/html.rb
|