scruber 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/exe/scruber +2 -1
- data/lib/scruber.rb +4 -2
- data/lib/scruber/app_searcher.rb +2 -1
- data/lib/scruber/cli.rb +8 -5
- data/lib/scruber/cli/generators.rb +33 -0
- data/lib/scruber/cli/project_generator.rb +1 -1
- data/lib/scruber/cli/templates/scrapers/sample.tt +6 -4
- data/lib/scruber/core/crawler.rb +17 -9
- data/lib/scruber/core/extensions/parser_aliases.rb +24 -0
- data/lib/scruber/core/extensions/queue_aliases.rb +30 -0
- data/lib/scruber/core/page_format/xml.rb +1 -1
- data/lib/scruber/queue_adapters/abstract_adapter.rb +24 -0
- data/lib/scruber/queue_adapters/memory.rb +7 -1
- data/lib/scruber/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 937243b2503853e755c95800e0d34ae5ecb939a7
|
4
|
+
data.tar.gz: 1e2472ff1d1487da8df94292daa57a27d5b52c5a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a30b80a4301ca18b87e1c91217ebdfd881e13c511e643ce198e7e9f28984d271b0133b11a065b6d971690fca49b948b289147884eed7c01ee5bfc985d35cc69
|
7
|
+
data.tar.gz: 77201fa5bbde27dfac6f7205711b05a3af9f85ab0ad705c6d08af5ea6b55f2c18c37a4e80878e3e7de7280ad945d20aca79aedc179087036f7bf176ecf05f597
|
data/exe/scruber
CHANGED
data/lib/scruber.rb
CHANGED
@@ -23,6 +23,8 @@ require "scruber/core/page_format/html"
|
|
23
23
|
require "scruber/core/extensions/base"
|
24
24
|
require "scruber/core/extensions/loop"
|
25
25
|
require "scruber/core/extensions/csv_output"
|
26
|
+
require "scruber/core/extensions/queue_aliases"
|
27
|
+
require "scruber/core/extensions/parser_aliases"
|
26
28
|
|
27
29
|
# require "scruber/core/configuration"
|
28
30
|
# require "scruber/core/configuration"
|
@@ -52,10 +54,10 @@ module Scruber
|
|
52
54
|
class << self
|
53
55
|
attr_writer :configuration
|
54
56
|
|
55
|
-
def run(
|
57
|
+
def run(*args, &block)
|
56
58
|
raise "You need a block to build!" unless block_given?
|
57
59
|
|
58
|
-
Core::Crawler.new(
|
60
|
+
Core::Crawler.new(*args).run(&block)
|
59
61
|
end
|
60
62
|
|
61
63
|
def configuration
|
data/lib/scruber/app_searcher.rb
CHANGED
@@ -6,7 +6,7 @@ module Scruber
|
|
6
6
|
RUBY = Gem.ruby
|
7
7
|
EXECUTABLES = ["bin/scruber"]
|
8
8
|
|
9
|
-
def exec_app
|
9
|
+
def exec_app
|
10
10
|
original_cwd = Dir.pwd
|
11
11
|
|
12
12
|
loop do
|
@@ -22,6 +22,7 @@ module Scruber
|
|
22
22
|
# Otherwise keep moving upwards in search of an executable.
|
23
23
|
Dir.chdir("..")
|
24
24
|
end
|
25
|
+
true
|
25
26
|
end
|
26
27
|
|
27
28
|
def find_scraper(name, app_path)
|
data/lib/scruber/cli.rb
CHANGED
@@ -1,22 +1,23 @@
|
|
1
1
|
require "thor"
|
2
2
|
require "scruber"
|
3
3
|
require "scruber/cli/project_generator"
|
4
|
+
require "scruber/cli/generators"
|
4
5
|
require "scruber/app_searcher"
|
5
6
|
|
6
7
|
module Scruber
|
7
8
|
module CLI
|
8
9
|
|
9
10
|
class Root < Thor
|
10
|
-
|
11
|
+
def self.exit_on_failure?
|
11
12
|
true
|
12
13
|
end
|
13
14
|
|
14
|
-
register
|
15
|
+
register ProjectGenerator, "new", "new PATH", "Create new project"
|
16
|
+
register Generators, 'generate', 'generate [GENERATOR]', 'Generate something'
|
15
17
|
|
16
18
|
desc 'start', 'Run scraper'
|
17
19
|
def start(name)
|
18
20
|
if defined?(APP_PATH)
|
19
|
-
# raise ::Thor::Error, "ERROR: Scruber project not found." unless File.exist?(File.expand_path('config/application', Dir.pwd))
|
20
21
|
scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
|
21
22
|
raise ::Thor::Error, "ERROR: Scraper not found." if scraper_path.nil?
|
22
23
|
say "booting..."
|
@@ -24,10 +25,12 @@ module Scruber
|
|
24
25
|
Dir[File.expand_path('../initializers/*.rb', APP_PATH)].sort.each do |i|
|
25
26
|
require i
|
26
27
|
end
|
27
|
-
|
28
|
+
ENV['SCRUBER_SCRAPER_NAME'] = File.basename(scraper_path).gsub(/\.rb\Z/, '').underscore
|
29
|
+
say "starting #{ENV['SCRUBER_SCRAPER_NAME']}"
|
30
|
+
|
28
31
|
require scraper_path
|
29
32
|
else
|
30
|
-
Scruber
|
33
|
+
raise ::Thor::Error, "ERROR: Scruber project not found."
|
31
34
|
end
|
32
35
|
end
|
33
36
|
|
data/lib/scruber/cli/generators.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require "thor"
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module Scruber
|
5
|
+
module CLI
|
6
|
+
class Generators < Thor
|
7
|
+
|
8
|
+
class ScraperGenerator < Thor::Group
|
9
|
+
include Thor::Actions
|
10
|
+
|
11
|
+
argument :name
|
12
|
+
|
13
|
+
def self.source_root
|
14
|
+
File.dirname(__FILE__) + '/templates'
|
15
|
+
end
|
16
|
+
|
17
|
+
def create_files
|
18
|
+
if defined?(APP_PATH)
|
19
|
+
scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
|
20
|
+
if scraper_path.present?
|
21
|
+
raise ::Thor::Error, "ERROR: Scraper already exists"
|
22
|
+
end
|
23
|
+
template 'scrapers/sample.tt', File.expand_path('../../scrapers/'+name+'.rb', APP_PATH)
|
24
|
+
else
|
25
|
+
raise ::Thor::Error, "ERROR: Scruber project not found."
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
register ScraperGenerator, 'scraper', 'scraper [NAME]', 'Generate scraper'
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/scruber/cli/project_generator.rb
CHANGED
@@ -25,9 +25,9 @@ module Scruber
|
|
25
25
|
template 'Gemfile.tt', path+'/Gemfile'
|
26
26
|
template 'gitignore.tt', path+'/.gitignore'
|
27
27
|
template 'bin/scruber.tt', path+'/bin/scruber'
|
28
|
+
chmod path+'/bin/scruber', '+x'
|
28
29
|
template 'application.tt', path+'/config/application.rb'
|
29
30
|
template 'boot.tt', path+'/config/boot.rb'
|
30
|
-
template 'boot.tt', path+'/config/boot.rb'
|
31
31
|
template 'initializers/proxies.tt', path+'/config/initializers/proxies.rb'
|
32
32
|
template 'initializers/user_agents.tt', path+'/config/initializers/user_agents.rb'
|
33
33
|
template 'scrapers/sample.tt', path+'/scrapers/sample.rb'
|
data/lib/scruber/cli/templates/scrapers/sample.tt
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
-
Scruber.run
|
2
|
-
|
1
|
+
Scruber.run do
|
2
|
+
get "http://..."
|
3
3
|
|
4
|
-
|
5
|
-
|
4
|
+
parse :html do |page, doc|
|
5
|
+
# page - queue page object
|
6
|
+
# doc - processed object, in this case Nokogiri::HTML(page.response_body) object
|
7
|
+
puts doc.at('title').text
|
6
8
|
end
|
7
9
|
end
|
data/lib/scruber/core/crawler.rb
CHANGED
@@ -3,8 +3,17 @@ module Scruber
|
|
3
3
|
class Crawler
|
4
4
|
attr_reader :queue, :fetcher, :scraper_name
|
5
5
|
|
6
|
-
def initialize(
|
7
|
-
|
6
|
+
def initialize(*args)
|
7
|
+
if args.first.is_a?(Hash)
|
8
|
+
scraper_name = nil
|
9
|
+
options = args.first
|
10
|
+
else
|
11
|
+
scraper_name, options = args
|
12
|
+
options ||= {}
|
13
|
+
end
|
14
|
+
@scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
|
15
|
+
raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
|
16
|
+
@scraper_name = @scraper_name.to_sym
|
8
17
|
Scruber.configuration.merge_options(options)
|
9
18
|
@callbacks_options = {}
|
10
19
|
@callbacks = {}
|
@@ -26,6 +35,7 @@ module Scruber
|
|
26
35
|
if @callbacks[page.page_type.to_sym]
|
27
36
|
processed_page = process_page(page, page.page_type.to_sym)
|
28
37
|
instance_exec page, processed_page, &(@callbacks[page.page_type.to_sym])
|
38
|
+
page.processed! unless page.sent_to_redownload?
|
29
39
|
end
|
30
40
|
end
|
31
41
|
end
|
@@ -39,14 +49,12 @@ module Scruber
|
|
39
49
|
end
|
40
50
|
|
41
51
|
def method_missing(method_sym, *arguments, &block)
|
42
|
-
Scruber::Core::Crawler._registered_method_missings.
|
52
|
+
Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
|
43
53
|
if (scan_results = method_sym.to_s.scan(pattern)).present?
|
44
|
-
instance_exec
|
45
|
-
true
|
46
|
-
else
|
47
|
-
false
|
54
|
+
return instance_exec(method_sym, scan_results, arguments+[block], &(func))
|
48
55
|
end
|
49
|
-
end
|
56
|
+
end
|
57
|
+
super
|
50
58
|
end
|
51
59
|
|
52
60
|
def respond_to?(method_sym, include_private = false)
|
@@ -81,7 +89,7 @@ module Scruber
|
|
81
89
|
end
|
82
90
|
|
83
91
|
def process_page(page, page_type)
|
84
|
-
page_format = @callbacks_options[page_type].fetch(:
|
92
|
+
page_format = @callbacks_options[page_type].fetch(:format){ nil }
|
85
93
|
Scruber::Core::PageFormat.process(page, page_format)
|
86
94
|
end
|
87
95
|
|
data/lib/scruber/core/extensions/parser_aliases.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Scruber
|
2
|
+
module Core
|
3
|
+
module Extensions
|
4
|
+
class ParserAliases < Base
|
5
|
+
module CoreMethods
|
6
|
+
def parse(*args, &block)
|
7
|
+
page_format = args.shift
|
8
|
+
parser('seed', {format: page_format}, &block)
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.included(base)
|
12
|
+
Scruber::Core::Crawler.register_method_missing /\Aparse_(\w+)\Z/ do |meth, scan_results, args|
|
13
|
+
page_type = scan_results.first.first
|
14
|
+
page_format = args.first.is_a?(Symbol) ? args.shift : nil
|
15
|
+
block = args.shift
|
16
|
+
parser(page_type, {format: page_format}, &block)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/scruber/core/extensions/queue_aliases.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module Scruber
|
2
|
+
module Core
|
3
|
+
module Extensions
|
4
|
+
class QueueAliases < Base
|
5
|
+
module CoreMethods
|
6
|
+
%w(get post head).each do |meth|
|
7
|
+
define_method meth.to_sym do |url, options={}|
|
8
|
+
queue.add url, options.merge({method: meth.to_sym})
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.included(base)
|
13
|
+
Scruber::Core::Crawler.register_method_missing /\A(get|post|head)_(\w+)\Z/ do |m, scan_results, args|
|
14
|
+
meth, page_type = scan_results.first
|
15
|
+
url, options = args
|
16
|
+
options = {} if options.nil?
|
17
|
+
Scruber::Core::Crawler.class_eval do
|
18
|
+
define_method "#{meth}_#{page_type}".to_sym do |url, options={}|
|
19
|
+
queue.add url, options.merge({method: meth.to_sym, page_type: page_type})
|
20
|
+
end
|
21
|
+
end
|
22
|
+
queue.add url, options.merge({method: meth.to_sym, page_type: page_type})
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
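data/lib/scruber/core/page_format/xml.rb
CHANGED
data/lib/scruber/queue_adapters/abstract_adapter.rb
CHANGED
@@ -53,6 +53,7 @@ module Scruber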
|
|
53
53
|
|
54
54
|
@_fetcher_agent = false
|
55
55
|
@_proxy = false
|
56
|
+
@_redownload = false
|
56
57
|
end
|
57
58
|
|
58
59
|
def fetcher_agent
|
@@ -96,6 +97,25 @@ module Scruber
|
|
96
97
|
raise NotImplementedError
|
97
98
|
end
|
98
99
|
|
100
|
+
def processed!
|
101
|
+
@processed_at = Time.now.to_i
|
102
|
+
@_redownload = false
|
103
|
+
save
|
104
|
+
end
|
105
|
+
|
106
|
+
def redownload!
|
107
|
+
@_redownload = true
|
108
|
+
|
109
|
+
@processed_at = nil
|
110
|
+
@retry_count += 1
|
111
|
+
@fetched_at = 0
|
112
|
+
@response_body = nil
|
113
|
+
save
|
114
|
+
end
|
115
|
+
|
116
|
+
def sent_to_redownload?
|
117
|
+
@_redownload
|
118
|
+
end
|
99
119
|
end
|
100
120
|
|
101
121
|
def initialize(options={})
|
@@ -113,6 +133,10 @@ module Scruber
|
|
113
133
|
def fetch_downloaded(count=nil)
|
114
134
|
raise NotImplementedError
|
115
135
|
end
|
136
|
+
|
137
|
+
def initialized?
|
138
|
+
raise NotImplementedError
|
139
|
+
end
|
116
140
|
end
|
117
141
|
end
|
118
142
|
end
|
data/lib/scruber/queue_adapters/memory.rb
CHANGED
@@ -5,7 +5,9 @@ module Scruber
|
|
5
5
|
|
6
6
|
class Page < Scruber::QueueAdapters::AbstractAdapter::Page
|
7
7
|
def save
|
8
|
-
if self.
|
8
|
+
if self.processed_at.to_i > 0
|
9
|
+
nil
|
10
|
+
elsif self.fetched_at > 0
|
9
11
|
@queue.add_downloaded self
|
10
12
|
elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
|
11
13
|
@queue.add_error_page self
|
@@ -73,6 +75,10 @@ module Scruber
|
|
73
75
|
@error_pages -= [page]
|
74
76
|
end
|
75
77
|
|
78
|
+
def initialized?
|
79
|
+
@queue.present? || @downloaded_pages.present? || @error_pages.present?
|
80
|
+
end
|
81
|
+
|
76
82
|
end
|
77
83
|
end
|
78
84
|
end
|
data/lib/scruber/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scruber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Goncharov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-03-
|
11
|
+
date: 2018-03-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|
@@ -171,6 +171,7 @@ files:
|
|
171
171
|
- lib/scruber.rb
|
172
172
|
- lib/scruber/app_searcher.rb
|
173
173
|
- lib/scruber/cli.rb
|
174
|
+
- lib/scruber/cli/generators.rb
|
174
175
|
- lib/scruber/cli/project_generator.rb
|
175
176
|
- lib/scruber/cli/templates/Gemfile.tt
|
176
177
|
- lib/scruber/cli/templates/application.tt
|
@@ -185,6 +186,8 @@ files:
|
|
185
186
|
- lib/scruber/core/extensions/base.rb
|
186
187
|
- lib/scruber/core/extensions/csv_output.rb
|
187
188
|
- lib/scruber/core/extensions/loop.rb
|
189
|
+
- lib/scruber/core/extensions/parser_aliases.rb
|
190
|
+
- lib/scruber/core/extensions/queue_aliases.rb
|
188
191
|
- lib/scruber/core/page_format.rb
|
189
192
|
- lib/scruber/core/page_format/base.rb
|
190
193
|
- lib/scruber/core/page_format/html.rb
|