bitreaper 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/bitreaper +1 -1
- data/lib/bitreaper.rb +85 -22
- data/lib/bitreaper/helpers.rb +10 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6f76eb586fc7ef623380a46183b99b55dc84eb8a0e620b78798e279be916c880
|
4
|
+
data.tar.gz: d25938d1f72c42fa225816f98e66cdfbffcc15b3f829327d3b97203af9dbe06d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2dd848cd9aee975fc662785dd8417beca9afb855a09afe9fed8e9b44965ae023c2faaf4706e48f1fecfab153d7daa710de245aaca57bc65d43bfc9bde5cb32bb
|
7
|
+
data.tar.gz: 3cb64e5a593c9184232f1101303a4d370537e7167dbf052ffcf0c351d7ea8d38fe0b824c2691ec7b764fa1dbb2921529f9ab07900d75d93c2856a42b28465326
|
data/bin/bitreaper
CHANGED
@@ -93,7 +93,7 @@ if $inputFile!=""
|
|
93
93
|
}
|
94
94
|
else
|
95
95
|
if not $verbose
|
96
|
-
Parallel.each_with_index($urls, in_threads:
|
96
|
+
Parallel.each_with_index($urls, in_threads: 6, progress: " ► Processing... "){|url,i|
|
97
97
|
br = BitReaper.new(url,$parser,i)
|
98
98
|
$store << br.process()
|
99
99
|
}
|
data/lib/bitreaper.rb
CHANGED
@@ -20,7 +20,7 @@ require 'sdl4r'
|
|
20
20
|
|
21
21
|
require_relative 'bitreaper/helpers.rb'
|
22
22
|
|
23
|
-
$bitreaper_version = "0.1.3"
|
23
|
+
$bitreaper_version = "0.1.4"
|
24
24
|
|
25
25
|
##
|
26
26
|
# This is the main Web Scraper object. It is through a `BitReaper` instance
|
@@ -54,7 +54,7 @@ class BitReaper
|
|
54
54
|
|
55
55
|
def self.getParser(file)
|
56
56
|
parserFile = File.read(file)
|
57
|
-
parserFile = parserFile.gsub(/([\w]+)
|
57
|
+
parserFile = parserFile.gsub(/([\w]+)\!/,'\1=on')
|
58
58
|
if $verbose
|
59
59
|
puts parserFile.split("\n").map{|l| " "+l}.join("\n").light_black
|
60
60
|
puts ""
|
@@ -73,7 +73,9 @@ class BitReaper
|
|
73
73
|
def download(url,withProgress=true)
|
74
74
|
printProgress(@url,@index,0) if withProgress
|
75
75
|
|
76
|
-
|
76
|
+
html = Nokogiri::HTML(open(url))
|
77
|
+
|
78
|
+
return html
|
77
79
|
end
|
78
80
|
|
79
81
|
##
|
@@ -108,6 +110,8 @@ class BitReaper
|
|
108
110
|
when "download"
|
109
111
|
val = val
|
110
112
|
val.downloadAs($outputDest,(param.is_a? String) ? param : nil)
|
113
|
+
when "exclude"
|
114
|
+
val = false
|
111
115
|
end
|
112
116
|
return val
|
113
117
|
end
|
@@ -126,9 +130,9 @@ class BitReaper
|
|
126
130
|
when "join"
|
127
131
|
val = val.join(param)
|
128
132
|
when "first"
|
129
|
-
val = val.first
|
133
|
+
val = param==true ? val.first : val.first(param)
|
130
134
|
when "last"
|
131
|
-
val = val.last
|
135
|
+
val = param==true ? val.last : val.last(param)
|
132
136
|
when "index"
|
133
137
|
val = val[param.to_i]
|
134
138
|
when "select.include"
|
@@ -143,10 +147,42 @@ class BitReaper
|
|
143
147
|
else
|
144
148
|
val = val.select{|r| r==param }
|
145
149
|
end
|
150
|
+
when "exclude"
|
151
|
+
val = false
|
146
152
|
end
|
147
153
|
return val
|
148
154
|
end
|
149
155
|
|
156
|
+
##
|
157
|
+
# Process Hash value using attribute
|
158
|
+
#
|
159
|
+
# @param [String] attrb The attribute to be processed
|
160
|
+
# @param [Hash] val The value to be processed
|
161
|
+
# @param [String] param The attribute's param (if any)
|
162
|
+
#
|
163
|
+
# @return [String,Array] The result of the operation
|
164
|
+
|
165
|
+
def processHashValue(attrb,val,param)
|
166
|
+
case attrb
|
167
|
+
when "list"
|
168
|
+
val = squish(val)
|
169
|
+
# toret = []
|
170
|
+
# list = val.first[1]
|
171
|
+
# list.each_with_index{|l,i|
|
172
|
+
# dict = {}
|
173
|
+
# val.keys.each{|key|
|
174
|
+
# if val[key].is_a? Array
|
175
|
+
# if i<val[key].count
|
176
|
+
# dict[key] = val[key][i]
|
177
|
+
# end
|
178
|
+
# end
|
179
|
+
# }
|
180
|
+
# toret << dict
|
181
|
+
# }
|
182
|
+
# val = toret
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
150
186
|
##
|
151
187
|
# Process parsed values using set of attributes
|
152
188
|
#
|
@@ -157,8 +193,14 @@ class BitReaper
|
|
157
193
|
|
158
194
|
def processValues(values,attrbs)
|
159
195
|
# check if we have a single value or an array of values
|
160
|
-
|
161
|
-
|
196
|
+
if values.is_a? Nokogiri::XML::NodeSet
|
197
|
+
# it is a nodeset, so let's extract the .content property
|
198
|
+
ret = (values.count==1) ? values[0].content
|
199
|
+
: values.map{|v| v.content}
|
200
|
+
else
|
201
|
+
# not a nodeset (perhaps a hash of values?)
|
202
|
+
ret = values
|
203
|
+
end
|
162
204
|
|
163
205
|
# no attributes, just return it
|
164
206
|
return ret if attrbs.size==0
|
@@ -168,15 +210,21 @@ class BitReaper
|
|
168
210
|
# get params if we have multiple params; or not
|
169
211
|
param = (arg.include? "||") ? (arg.split("||").map{|a| Liquid::Template.parse(a).render(@store) })
|
170
212
|
: Liquid::Template.parse(arg).render(@store)
|
213
|
+
else
|
214
|
+
param = arg
|
171
215
|
end
|
172
216
|
|
173
217
|
if ret.is_a? String
|
174
218
|
# if our value is a String, process it accordingly
|
175
219
|
ret = self.processStringValue(attrb,ret,param)
|
176
|
-
|
220
|
+
elsif ret.is_a? Array
|
177
221
|
# it's an array of values, so look for array-operating attributes
|
178
222
|
ret = self.processArrayValue(attrb,ret,param)
|
179
|
-
|
223
|
+
elsif ret.is_a? Hash
|
224
|
+
# it's a value hash, so process it accordingly
|
225
|
+
ret = self.processHashValue(attrb,ret,param)
|
226
|
+
else
|
227
|
+
## Unsupported value type (not a String, Array or Hash): leave it unchanged
|
180
228
|
end
|
181
229
|
}
|
182
230
|
|
@@ -194,26 +242,41 @@ class BitReaper
|
|
194
242
|
def processNode(noko,node,store,level=0)
|
195
243
|
node.children.each{|child|
|
196
244
|
command = child.namespace
|
197
|
-
tag = child.name
|
245
|
+
tag = Liquid::Template.parse(child.name).render(@store)
|
198
246
|
pattern = child.values[0]
|
199
247
|
attrs = child.attributes
|
200
248
|
|
201
|
-
if
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
249
|
+
if not command==""
|
250
|
+
case tag
|
251
|
+
when "fetch"
|
252
|
+
gotoUrl = Liquid::Template.parse(pattern).render(@store)
|
253
|
+
br = BitReaper.new(gotoUrl,child)
|
254
|
+
store.merge! br.process()
|
207
255
|
end
|
208
256
|
else
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
257
|
+
if child.children.count==0
|
258
|
+
# no children, so it's a "get"
|
259
|
+
values = noko.search(pattern)
|
260
|
+
|
261
|
+
if values.count>0
|
262
|
+
processed = self.processValues(values,attrs)
|
263
|
+
if processed!=false
|
264
|
+
store[tag] = processed
|
265
|
+
end
|
266
|
+
end
|
213
267
|
else
|
214
|
-
|
268
|
+
# it's a "section"
|
269
|
+
store[tag] = {}
|
270
|
+
|
271
|
+
if pattern.nil?
|
272
|
+
subnoko = noko
|
273
|
+
else
|
274
|
+
subnoko = noko.search(pattern)
|
275
|
+
end
|
276
|
+
|
277
|
+
processNode(subnoko,child,store[tag],level+1)
|
278
|
+
store[tag] = self.processValues(store[tag],attrs)
|
215
279
|
end
|
216
|
-
processNode(subnoko,child,store[tag],level+1)
|
217
280
|
end
|
218
281
|
}
|
219
282
|
end
|
data/lib/bitreaper/helpers.rb
CHANGED
@@ -48,6 +48,16 @@ class String
|
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
51
|
+
def squish(ha)
|
52
|
+
h = ha
|
53
|
+
h.each{ |key,val|
|
54
|
+
if not val.nil? and val.is_a? Hash
|
55
|
+
h[key] = squish(val)
|
56
|
+
end
|
57
|
+
}
|
58
|
+
h.values.then { |a, *b| a.zip *b }.map { |e| (h.keys.zip e).to_h }
|
59
|
+
end
|
60
|
+
|
51
61
|
## Core
|
52
62
|
|
53
63
|
def printLogo
|