url_processor 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/url_processor/base.rb +56 -49
- data/lib/url_processor/version.rb +1 -1
- metadata +4 -4
data/lib/url_processor/base.rb
CHANGED
@@ -51,19 +51,23 @@ module UrlProcessor
|
|
51
51
|
# Output progress information
|
52
52
|
config.logger.info "PROCESSED: #{processed_links}, NEXT GROUP SIZE: #{group.size}".yellow
|
53
53
|
|
54
|
+
yield group
|
55
|
+
|
54
56
|
# for debugging purposes we do not want to process everything
|
55
57
|
if config.debug && processed_links >= config.batch_size
|
56
58
|
config.logger.debug "FINISHED first batch (#{@batch_size} records), exiting".yellow
|
57
59
|
return
|
58
60
|
end
|
59
61
|
|
60
|
-
group.each do |element|
|
61
|
-
yield element
|
62
|
-
end
|
63
62
|
end
|
64
63
|
else
|
64
|
+
elements = []
|
65
65
|
collection.each do |element|
|
66
|
-
|
66
|
+
elements << element
|
67
|
+
if elements.size % batch_size == 0
|
68
|
+
yield elements
|
69
|
+
elements = elements.clear
|
70
|
+
end
|
67
71
|
end
|
68
72
|
end
|
69
73
|
end
|
@@ -72,53 +76,56 @@ module UrlProcessor
|
|
72
76
|
processed_links = 0
|
73
77
|
hydra = Typhoeus::Hydra.new(max_concurrency: config.max_concurrency, max_total_connections: config.max_total_connections)
|
74
78
|
|
75
|
-
find_in_batches(config.links.call, config.batch_size) do |link|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
79
|
+
find_in_batches(config.links.call, config.batch_size) do |group|
|
80
|
+
|
81
|
+
group.each do |link|
|
82
|
+
# any custom pre-processing
|
83
|
+
pre_process_link(link)
|
84
|
+
|
85
|
+
if link.urls.empty?
|
86
|
+
# In the event that we have a link that actually has no urls associated with it
|
87
|
+
report_broken_link link.id, :response_code => :has_no_urls if config.report_records_without_urls
|
88
|
+
else
|
89
|
+
# Each record has 2 urls associated with it, process each separately
|
90
|
+
link.urls.each do |url|
|
91
|
+
config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow
|
92
|
+
|
93
|
+
link_request = config.new_link_request.call(
|
94
|
+
url[:url],
|
95
|
+
followlocation: true,
|
96
|
+
method: :head,
|
97
|
+
ssl_verifypeer: false,
|
98
|
+
ssl_verifyhost: 2,
|
99
|
+
cookiefile: config.cookies_file,
|
100
|
+
cookiejar: config.cookies_file,
|
101
|
+
link_id: link.id,
|
102
|
+
url_type_code: url[:url_type_code],
|
103
|
+
timeout: config.max_timeout,
|
104
|
+
connecttimeout: config.max_timeout,
|
105
|
+
max_retries: config.max_retries,
|
106
|
+
forbid_reuse: 1,
|
107
|
+
nosignal: 1
|
108
|
+
)
|
109
|
+
|
110
|
+
link_request.on_complete do |response|
|
111
|
+
processed_links += 1
|
112
|
+
|
113
|
+
if ([:operation_timedout, :couldnt_resolve_host].include? response.return_code) && response.request.retry_request?
|
114
|
+
config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
|
115
|
+
hydra.queue response.request
|
116
|
+
elsif response.return_code == :got_nothing && response.request.options[:method] != :get
|
117
|
+
config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow
|
118
|
+
|
119
|
+
# set to GET request since HEAD may fail in some cases
|
120
|
+
response.request.options[:method] = :get
|
121
|
+
hydra.queue response.request
|
122
|
+
else
|
123
|
+
config.process_response.call response
|
124
|
+
end
|
118
125
|
end
|
119
|
-
end
|
120
126
|
|
121
|
-
|
127
|
+
hydra.queue link_request
|
128
|
+
end
|
122
129
|
end
|
123
130
|
end
|
124
131
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-04-
|
12
|
+
date: 2014-04-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: typhoeus
|
@@ -186,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
186
186
|
version: '0'
|
187
187
|
segments:
|
188
188
|
- 0
|
189
|
-
hash:
|
189
|
+
hash: 1450876520014070707
|
190
190
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
191
191
|
none: false
|
192
192
|
requirements:
|
@@ -195,7 +195,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
195
195
|
version: '0'
|
196
196
|
segments:
|
197
197
|
- 0
|
198
|
-
hash:
|
198
|
+
hash: 1450876520014070707
|
199
199
|
requirements: []
|
200
200
|
rubyforge_project:
|
201
201
|
rubygems_version: 1.8.23.2
|