bitreaper 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/bitreaper +1 -1
- data/lib/bitreaper.rb +85 -22
- data/lib/bitreaper/helpers.rb +10 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6f76eb586fc7ef623380a46183b99b55dc84eb8a0e620b78798e279be916c880
|
4
|
+
data.tar.gz: d25938d1f72c42fa225816f98e66cdfbffcc15b3f829327d3b97203af9dbe06d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2dd848cd9aee975fc662785dd8417beca9afb855a09afe9fed8e9b44965ae023c2faaf4706e48f1fecfab153d7daa710de245aaca57bc65d43bfc9bde5cb32bb
|
7
|
+
data.tar.gz: 3cb64e5a593c9184232f1101303a4d370537e7167dbf052ffcf0c351d7ea8d38fe0b824c2691ec7b764fa1dbb2921529f9ab07900d75d93c2856a42b28465326
|
data/bin/bitreaper
CHANGED
@@ -93,7 +93,7 @@ if $inputFile!=""
|
|
93
93
|
}
|
94
94
|
else
|
95
95
|
if not $verbose
|
96
|
-
Parallel.each_with_index($urls, in_threads:
|
96
|
+
Parallel.each_with_index($urls, in_threads: 6, progress: " ► Processing... "){|url,i|
|
97
97
|
br = BitReaper.new(url,$parser,i)
|
98
98
|
$store << br.process()
|
99
99
|
}
|
data/lib/bitreaper.rb
CHANGED
@@ -20,7 +20,7 @@ require 'sdl4r'
|
|
20
20
|
|
21
21
|
require_relative 'bitreaper/helpers.rb'
|
22
22
|
|
23
|
-
$bitreaper_version = "0.1.
|
23
|
+
$bitreaper_version = "0.1.4"
|
24
24
|
|
25
25
|
##
|
26
26
|
# This is the main Web Scraper object. It is through a `BitReaper` instance
|
@@ -54,7 +54,7 @@ class BitReaper
|
|
54
54
|
|
55
55
|
def self.getParser(file)
|
56
56
|
parserFile = File.read(file)
|
57
|
-
parserFile = parserFile.gsub(/([\w]+)
|
57
|
+
parserFile = parserFile.gsub(/([\w]+)\!/,'\1=on')
|
58
58
|
if $verbose
|
59
59
|
puts parserFile.split("\n").map{|l| " "+l}.join("\n").light_black
|
60
60
|
puts ""
|
@@ -73,7 +73,9 @@ class BitReaper
|
|
73
73
|
def download(url,withProgress=true)
  # Fetch +url+ and return it parsed as a Nokogiri HTML document.
  #
  # @param [String] url The location to download
  # @param [Boolean] withProgress Whether to print the initial progress line
  #
  # @return [Nokogiri::HTML::Document] The parsed page
  printProgress(@url,@index,0) if withProgress

  # URI.open is used instead of Kernel#open: since Ruby 3.0 Kernel#open no
  # longer fetches URLs, and it would run a command for a string starting
  # with "|". This relies on open-uri, which the previous open(url) call
  # already needed in order to fetch URLs.
  # The block form closes the underlying handle once parsing is done.
  html = URI.open(url) { |page| Nokogiri::HTML(page) }

  return html
end
|
78
80
|
|
79
81
|
##
|
@@ -108,6 +110,8 @@ class BitReaper
|
|
108
110
|
when "download"
|
109
111
|
val = val
|
110
112
|
val.downloadAs($outputDest,(param.is_a? String) ? param : nil)
|
113
|
+
when "exclude"
|
114
|
+
val = false
|
111
115
|
end
|
112
116
|
return val
|
113
117
|
end
|
@@ -126,9 +130,9 @@ class BitReaper
|
|
126
130
|
when "join"
|
127
131
|
val = val.join(param)
|
128
132
|
when "first"
|
129
|
-
val = val.first
|
133
|
+
val = param==true ? val.first : val.first(param)
|
130
134
|
when "last"
|
131
|
-
val = val.last
|
135
|
+
val = param==true ? val.last : val.last(param)
|
132
136
|
when "index"
|
133
137
|
val = val[param.to_i]
|
134
138
|
when "select.include"
|
@@ -143,10 +147,42 @@ class BitReaper
|
|
143
147
|
else
|
144
148
|
val = val.select{|r| r==param }
|
145
149
|
end
|
150
|
+
when "exclude"
|
151
|
+
val = false
|
146
152
|
end
|
147
153
|
return val
|
148
154
|
end
|
149
155
|
|
156
|
+
##
|
157
|
+
# Process Hash value using attribute
|
158
|
+
#
|
159
|
+
# @param [String] attrb The attribute to be processed
|
160
|
+
# @param [Hash] val The value to be processed
|
161
|
+
# @param [String] param The attribute's param (if any)
|
162
|
+
#
|
163
|
+
# @return [String,Array] The result of the operation
|
164
|
+
|
165
|
+
def processHashValue(attrb,val,param)
  # Apply a single attribute to a Hash of parsed values.
  #
  # Only the "list" attribute changes the value; for any other attribute
  # the Hash is handed back untouched. +param+ is accepted for signature
  # parity with the String/Array processors and is not used here.
  case attrb
    when "list"
      # turn the Hash of per-key lists into a list of per-item Hashes
      val = squish(val)
  end
  # Return the value explicitly, as the String/Array processors do.
  # Without this, an unrecognised attribute makes the method evaluate to
  # nil and the caller (which assigns the result back) loses the Hash.
  return val
end
|
185
|
+
|
150
186
|
##
|
151
187
|
# Process parsed values using set of attributes
|
152
188
|
#
|
@@ -157,8 +193,14 @@ class BitReaper
|
|
157
193
|
|
158
194
|
def processValues(values,attrbs)
|
159
195
|
# check if we have a single value or an array of values
|
160
|
-
|
161
|
-
|
196
|
+
if values.is_a? Nokogiri::XML::NodeSet
|
197
|
+
# it is a nodeset, so let's extract the .content property
|
198
|
+
ret = (values.count==1) ? values[0].content
|
199
|
+
: values.map{|v| v.content}
|
200
|
+
else
|
201
|
+
# not a nodeset (perhaps a hash of values?)
|
202
|
+
ret = values
|
203
|
+
end
|
162
204
|
|
163
205
|
# no attributes, just return it
|
164
206
|
return ret if attrbs.size==0
|
@@ -168,15 +210,21 @@ class BitReaper
|
|
168
210
|
# get params if we have multiple params; or not
|
169
211
|
param = (arg.include? "||") ? (arg.split("||").map{|a| Liquid::Template.parse(a).render(@store) })
|
170
212
|
: Liquid::Template.parse(arg).render(@store)
|
213
|
+
else
|
214
|
+
param = arg
|
171
215
|
end
|
172
216
|
|
173
217
|
if ret.is_a? String
|
174
218
|
# if our value is a String, process it accordingly
|
175
219
|
ret = self.processStringValue(attrb,ret,param)
|
176
|
-
|
220
|
+
elsif ret.is_a? Array
|
177
221
|
# it's an array of values, so look for array-operating attributes
|
178
222
|
ret = self.processArrayValue(attrb,ret,param)
|
179
|
-
|
223
|
+
elsif ret.is_a? Hash
|
224
|
+
# it's a value hash, so process it accordingly
|
225
|
+
ret = self.processHashValue(attrb,ret,param)
|
226
|
+
else
|
227
|
+
## Wtf is that?
|
180
228
|
end
|
181
229
|
}
|
182
230
|
|
@@ -194,26 +242,41 @@ class BitReaper
|
|
194
242
|
def processNode(noko,node,store,level=0)
  # Walk the children of a parser node and fill +store+ with what they yield.
  # Each child is one of: a namespaced command, a leaf "get", or a "section"
  # that is processed recursively into its own sub-Hash.
  node.children.each do |child|
    command = child.namespace
    tag     = Liquid::Template.parse(child.name).render(@store)
    pattern = child.values[0]
    attrs   = child.attributes

    unless command==""
      # namespaced child: a command; "fetch" is the only one handled
      if tag=="fetch"
        gotoUrl = Liquid::Template.parse(pattern).render(@store)
        store.merge! BitReaper.new(gotoUrl,child).process()
      end
      next
    end

    if child.children.count==0
      # no children, so it's a "get"
      values = noko.search(pattern)
      next unless values.count>0

      processed = self.processValues(values,attrs)
      # a result of false means "do not store this tag"
      store[tag] = processed unless processed==false
      next
    end

    # it's a "section": collect its children into a fresh Hash,
    # then apply the section's own attributes to that Hash
    store[tag] = {}
    subnoko = pattern.nil? ? noko : noko.search(pattern)

    processNode(subnoko,child,store[tag],level+1)
    store[tag] = self.processValues(store[tag],attrs)
  end
end
|
data/lib/bitreaper/helpers.rb
CHANGED
@@ -48,6 +48,16 @@ class String
|
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
51
|
+
##
# Transpose a Hash of parallel lists into a list of Hashes.
#
# For example +{"a"=>[1,2], "b"=>[3,4]}+ becomes
# +[{"a"=>1,"b"=>3}, {"a"=>2,"b"=>4}]+. Nested Hash values are squished
# recursively first. The number of rows follows the first column; shorter
# columns are padded with nil.
#
# @param [Hash] ha The Hash whose values are the columns to combine
#
# @return [Array<Hash>] One Hash per row; empty when there are no columns

def squish(ha)
  # build a fresh Hash so the caller's Hash is left untouched
  columns = ha.each_with_object({}) { |(key,val),acc|
    val = squish(val) if val.is_a? Hash
    # a lone (non-Array) value counts as a one-row column; nil as no rows
    acc[key] = Array(val)
  }
  # nothing to transpose (avoids calling zip on nil)
  return [] if columns.empty?

  first, *rest = columns.values
  first.zip(*rest).map { |row| columns.keys.zip(row).to_h }
end
|
60
|
+
|
51
61
|
## Core
|
52
62
|
|
53
63
|
def printLogo
|