url_processor 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/url_processor/base.rb +56 -49
- data/lib/url_processor/version.rb +1 -1
- metadata +4 -4
data/lib/url_processor/base.rb
CHANGED
@@ -51,19 +51,23 @@ module UrlProcessor
|
|
51
51
|
# Output progress information
|
52
52
|
config.logger.info "PROCESSED: #{processed_links}, NEXT GROUP SIZE: #{group.size}".yellow
|
53
53
|
|
54
|
+
yield group
|
55
|
+
|
54
56
|
# for debuggin purposes we do not want to process everything
|
55
57
|
if config.debug && processed_links >= config.batch_size
|
56
58
|
config.logger.debug "FINISHED first batch (#{@batch_size} records), exiting".yellow
|
57
59
|
return
|
58
60
|
end
|
59
61
|
|
60
|
-
group.each do |element|
|
61
|
-
yield element
|
62
|
-
end
|
63
62
|
end
|
64
63
|
else
|
64
|
+
elements = []
|
65
65
|
collection.each do |element|
|
66
|
-
|
66
|
+
elements << element
|
67
|
+
if elements.size % batch_size == 0
|
68
|
+
yield elements
|
69
|
+
elements = elements.clear
|
70
|
+
end
|
67
71
|
end
|
68
72
|
end
|
69
73
|
end
|
@@ -72,53 +76,56 @@ module UrlProcessor
|
|
72
76
|
processed_links = 0
|
73
77
|
hydra = Typhoeus::Hydra.new(max_concurrency: config.max_concurrency, max_total_connections: config.max_total_connections)
|
74
78
|
|
75
|
-
find_in_batches(config.links.call, config.batch_size) do |
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
79
|
+
find_in_batches(config.links.call, config.batch_size) do |group|
|
80
|
+
|
81
|
+
group.each do |link|
|
82
|
+
# any custom pre-processing
|
83
|
+
pre_process_link(link)
|
84
|
+
|
85
|
+
if link.urls.empty?
|
86
|
+
# In the event that we have a link that actually has no urls associated with it
|
87
|
+
report_broken_link link.id, :response_code => :has_no_urls if config.report_records_without_urls
|
88
|
+
else
|
89
|
+
# Each record has 2 urls associated with it, process each separately
|
90
|
+
link.urls.each do |url|
|
91
|
+
config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow
|
92
|
+
|
93
|
+
link_request = config.new_link_request.call(
|
94
|
+
url[:url],
|
95
|
+
followlocation: true,
|
96
|
+
method: :head,
|
97
|
+
ssl_verifypeer: false,
|
98
|
+
ssl_verifyhost: 2,
|
99
|
+
cookiefile: config.cookies_file,
|
100
|
+
cookiejar: config.cookies_file,
|
101
|
+
link_id: link.id,
|
102
|
+
url_type_code: url[:url_type_code],
|
103
|
+
timeout: config.max_timeout,
|
104
|
+
connecttimeout: config.max_timeout,
|
105
|
+
max_retries: config.max_retries,
|
106
|
+
forbid_reuse: 1,
|
107
|
+
nosignal: 1
|
108
|
+
)
|
109
|
+
|
110
|
+
link_request.on_complete do |response|
|
111
|
+
processed_links += 1
|
112
|
+
|
113
|
+
if ([:operation_timedout, :couldnt_resolve_host].include? response.return_code) && response.request.retry_request?
|
114
|
+
config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
|
115
|
+
hydra.queue response.request
|
116
|
+
elsif response.return_code == :got_nothing && response.request.options[:method] != :get
|
117
|
+
config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow
|
118
|
+
|
119
|
+
# set to GET request since HEAD may fail in some cases
|
120
|
+
response.request.options[:method] = :get
|
121
|
+
hydra.queue response.request
|
122
|
+
else
|
123
|
+
config.process_response.call response
|
124
|
+
end
|
118
125
|
end
|
119
|
-
end
|
120
126
|
|
121
|
-
|
127
|
+
hydra.queue link_request
|
128
|
+
end
|
122
129
|
end
|
123
130
|
end
|
124
131
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-04-
|
12
|
+
date: 2014-04-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: typhoeus
|
@@ -186,7 +186,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
186
186
|
version: '0'
|
187
187
|
segments:
|
188
188
|
- 0
|
189
|
-
hash:
|
189
|
+
hash: 1450876520014070707
|
190
190
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
191
191
|
none: false
|
192
192
|
requirements:
|
@@ -195,7 +195,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
195
195
|
version: '0'
|
196
196
|
segments:
|
197
197
|
- 0
|
198
|
-
hash:
|
198
|
+
hash: 1450876520014070707
|
199
199
|
requirements: []
|
200
200
|
rubyforge_project:
|
201
201
|
rubygems_version: 1.8.23.2
|