bitreaper 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/bitreaper +27 -9
- data/lib/bitreaper.rb +71 -13
- data/lib/bitreaper/helpers.rb +36 -2
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 806186133e8f475e8040fc7bee4e676d49665f83f16bcb127c171b7239e0aa94
|
4
|
+
data.tar.gz: 693e9fbe65d0b4e697c9cf7eaa871ed905b35509b8a0ff7dd0953933c8b77635
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3080d190593d846cb6cc225c9e775e3899725a10e8785b7d506ba931d12203bdf76f7b0fe533f09b6b074761c1df9f0d46a80c513849377b1ec0cc8b49fdf4f6
|
7
|
+
data.tar.gz: 84a2212ce1ab9c9ccd3a43cec0bcff53d3b6ff814d56bc833488668193b7e8ffc80b18732a07b0c5409fa392e645c8c7de5a58f2f052da3bcee772adffe26dbe
|
data/bin/bitreaper
CHANGED
@@ -13,6 +13,8 @@
|
|
13
13
|
|
14
14
|
require 'colorize'
|
15
15
|
require 'optparse'
|
16
|
+
require 'parallel'
|
17
|
+
require 'ruby-progressbar'
|
16
18
|
|
17
19
|
require 'bitreaper'
|
18
20
|
|
@@ -24,8 +26,9 @@ require 'bitreaper'
|
|
24
26
|
|
25
27
|
$url = ""
|
26
28
|
$inputFile = ""
|
27
|
-
$
|
29
|
+
$outputDest = "output"
|
28
30
|
|
31
|
+
$parallel = false
|
29
32
|
$verbose = false
|
30
33
|
|
31
34
|
# Parse command-line options
|
@@ -36,9 +39,10 @@ ARGV.options do |opts|
|
|
36
39
|
"Usage: bitreaper <parser> [options]\n\n"
|
37
40
|
|
38
41
|
opts.on("-iINPUT", "--input==INPUT",String,"Set input file") {|val| $inputFile = val }
|
39
|
-
opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output
|
42
|
+
opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output name") {|val| $outputDest = val }
|
40
43
|
opts.on("-uURL", "--url=URL",String,"Set single url input") {|val| $url = val }
|
41
|
-
opts.on("-
|
44
|
+
opts.on("-p", "--parallel", "Perform multi-threaded processing"){|val| $parallel = true }
|
45
|
+
opts.on("-v", "--verbose", "Print debug messages") {|val| $verbose = true }
|
42
46
|
opts.on_tail("-h", "--help","Show this help page") {
|
43
47
|
puts opts
|
44
48
|
puts ""
|
@@ -82,13 +86,27 @@ if $inputFile!=""
|
|
82
86
|
|
83
87
|
# Process project
|
84
88
|
$store = []
|
85
|
-
$
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
+
if not $parallel
|
90
|
+
$urls.each_with_index{|url,i|
|
91
|
+
br = BitReaper.new(url,$parser,i)
|
92
|
+
$store << br.process()
|
93
|
+
}
|
94
|
+
else
|
95
|
+
if not $verbose
|
96
|
+
Parallel.each_with_index($urls, in_threads: 4, progress: " ► Processing... "){|url,i|
|
97
|
+
br = BitReaper.new(url,$parser,i)
|
98
|
+
$store << br.process()
|
99
|
+
}
|
100
|
+
else
|
101
|
+
Parallel.each_with_index($urls, in_threads: 4){|url,i|
|
102
|
+
br = BitReaper.new(url,$parser,i)
|
103
|
+
$store << br.process()
|
104
|
+
}
|
105
|
+
end
|
106
|
+
end
|
89
107
|
|
90
108
|
# Save store to file
|
91
|
-
saveStoreToFile($
|
109
|
+
saveStoreToFile("#{$outputDest}.json",$store)
|
92
110
|
|
93
111
|
else
|
94
112
|
|
@@ -101,7 +119,7 @@ else
|
|
101
119
|
$store = br.process()
|
102
120
|
|
103
121
|
# Save store to file
|
104
|
-
saveStoreToFile($
|
122
|
+
saveStoreToFile("#{$outputDest}.json",$store)
|
105
123
|
|
106
124
|
end
|
107
125
|
|
data/lib/bitreaper.rb
CHANGED
@@ -26,13 +26,22 @@ require_relative 'bitreaper/helpers.rb'
|
|
26
26
|
# SUPERGLOBALS
|
27
27
|
##########################################
|
28
28
|
|
29
|
-
$bitreaper_version = 0.1
|
29
|
+
$bitreaper_version = 0.1.2
|
30
30
|
|
31
|
-
|
32
|
-
# MAIN CLASS
|
33
|
-
|
31
|
+
####################################################################################
|
32
|
+
# **MAIN CLASS**
|
33
|
+
# This is the main Web Scraper object. It is through a `BitReaper` instance
|
34
|
+
# that you can start scraping
|
35
|
+
####################################################################################
|
34
36
|
|
35
37
|
class BitReaper
|
38
|
+
|
39
|
+
# Create a new BitReaper instance
|
40
|
+
#
|
41
|
+
# @param [String] url The URL of the page to be scraped
|
42
|
+
# @param [String,SDL4R::Tag] parser The parser
|
43
|
+
# @param [Integer] i Index of the current operation (for reporting purposes)
|
44
|
+
#---------------------------------------------------------------------------
|
36
45
|
def initialize(url,parser,i=0)
|
37
46
|
@url = url
|
38
47
|
@parser = (parser.is_a? String) ? self.getParser(parser) : parser
|
@@ -43,10 +52,16 @@ class BitReaper
|
|
43
52
|
@noko = self.download(@url)
|
44
53
|
end
|
45
54
|
|
55
|
+
# Get a new parser from a given parser path
|
56
|
+
#
|
57
|
+
# @param [String] file The path of the `.br` parser file
|
58
|
+
#
|
59
|
+
# @return [SDL4R::Tag] The resulting parser
|
60
|
+
#---------------------------------------------------------------------------
|
46
61
|
def self.getParser(file)
|
47
62
|
parserFile = File.read(file)
|
48
63
|
parserFile = parserFile.gsub(/([\w]+)\!\s/,'\1=on')
|
49
|
-
if
|
64
|
+
if $verbose
|
50
65
|
puts parserFile.split("\n").map{|l| " "+l}.join("\n").light_black
|
51
66
|
puts ""
|
52
67
|
end
|
@@ -54,12 +69,38 @@ class BitReaper
|
|
54
69
|
return SDL4R::read(parserFile)
|
55
70
|
end
|
56
71
|
|
72
|
+
# Process current project
|
73
|
+
#---------------------------------------------------------------------------
|
74
|
+
def process
|
75
|
+
printProgress(@url,@index,1)
|
76
|
+
processNode(@noko, @parser, @store)
|
77
|
+
|
78
|
+
printProgress(@url,@index,2)
|
79
|
+
return @store
|
80
|
+
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
# Download given URL
|
85
|
+
#
|
86
|
+
# @param [String] url The URL to be downloaded
|
87
|
+
#
|
88
|
+
# @return [Nokogiri::HTML::Document] The parsed HTML document
|
89
|
+
#---------------------------------------------------------------------------
|
57
90
|
def download(url,withProgress=true)
|
58
91
|
printProgress(@url,@index,0) if withProgress
|
59
92
|
|
60
93
|
return Nokogiri::HTML(open(url))
|
61
94
|
end
|
62
95
|
|
96
|
+
# Process String value using attribute
|
97
|
+
#
|
98
|
+
# @param [String] attrb The attribute to be processed
|
99
|
+
# @param [String] val The value to be processed
|
100
|
+
# @param [String] param The attribute's param (if any)
|
101
|
+
#
|
102
|
+
# @return [String,Array] The result of the operation
|
103
|
+
#---------------------------------------------------------------------------
|
63
104
|
def processStringValue(attrb,val,param)
|
64
105
|
case attrb
|
65
106
|
when "prepend"
|
@@ -80,10 +121,21 @@ class BitReaper
|
|
80
121
|
val = val.gsub(param,"")
|
81
122
|
when "split"
|
82
123
|
val = val.split(param)
|
124
|
+
when "download"
|
125
|
+
val = val
|
126
|
+
val.downloadAs($outputDest,(param.is_a? String) ? param : nil)
|
83
127
|
end
|
84
128
|
return val
|
85
129
|
end
|
86
130
|
|
131
|
+
# Process Array value using attribute
|
132
|
+
#
|
133
|
+
# @param [String] attrb The attribute to be processed
|
134
|
+
# @param [Array] val The value to be processed
|
135
|
+
# @param [String] param The attribute's param (if any)
|
136
|
+
#
|
137
|
+
# @return [String,Array] The result of the operation
|
138
|
+
#---------------------------------------------------------------------------
|
87
139
|
def processArrayValue(attrb,val,param)
|
88
140
|
case attrb
|
89
141
|
when "join"
|
@@ -110,6 +162,13 @@ class BitReaper
|
|
110
162
|
return val
|
111
163
|
end
|
112
164
|
|
165
|
+
# Process parsed values using set of attributes
|
166
|
+
#
|
167
|
+
# @param [Array] values The parsed values
|
168
|
+
# @param [Array] attrbs The associated attributes
|
169
|
+
#
|
170
|
+
# @return [String,Array] The result of the operation
|
171
|
+
#---------------------------------------------------------------------------
|
113
172
|
def processValues(values,attrbs)
|
114
173
|
# check if we have a single value or an array of values
|
115
174
|
ret = (values.count==1) ? values[0].content
|
@@ -138,6 +197,13 @@ class BitReaper
|
|
138
197
|
return (ret.nil?) ? "" : ret
|
139
198
|
end
|
140
199
|
|
200
|
+
# Process a given node using provided parser and temporary storage hash
|
201
|
+
#
|
202
|
+
# @param [Nokogiri::XML::Node] noko The Nokogiri node to work on
|
203
|
+
# @param [SDL4R::Tag] node The parser node
|
204
|
+
# @param [Hash] store The temporary storage hash
|
205
|
+
# @param [Integer] level The nesting level (for informational purposes)
|
206
|
+
#---------------------------------------------------------------------------
|
141
207
|
def processNode(noko,node,store,level=0)
|
142
208
|
node.children.each{|child|
|
143
209
|
command = child.namespace
|
@@ -165,14 +231,6 @@ class BitReaper
|
|
165
231
|
}
|
166
232
|
end
|
167
233
|
|
168
|
-
def process
|
169
|
-
printProgress(@url,@index,1)
|
170
|
-
processNode(@noko, @parser, @store)
|
171
|
-
|
172
|
-
printProgress(@url,@index,2)
|
173
|
-
return @store
|
174
|
-
end
|
175
|
-
|
176
234
|
end
|
177
235
|
|
178
236
|
#######################################################
|
data/lib/bitreaper/helpers.rb
CHANGED
@@ -10,10 +10,15 @@
|
|
10
10
|
# @file lib/bitreaper/helpers.rb
|
11
11
|
#######################################################
|
12
12
|
|
13
|
+
require 'down'
|
14
|
+
require 'fileutils'
|
15
|
+
|
13
16
|
##########################################
|
14
17
|
# HELPER FUNCTIONS
|
15
18
|
##########################################
|
16
19
|
|
20
|
+
## Misc
|
21
|
+
|
17
22
|
class String
|
18
23
|
def ellipsisize(minimum_length=15,edge_length=15)
|
19
24
|
return self if self.length < minimum_length or self.length <= edge_length*2
|
@@ -21,8 +26,30 @@ class String
|
|
21
26
|
mid_length = self.length - edge_length*2
|
22
27
|
gsub(/(#{edge}).{#{mid_length},}(#{edge})/, '\1...\2')
|
23
28
|
end
|
29
|
+
|
30
|
+
def downloadAs(dest,filename=nil)
|
31
|
+
# make sure 'dest' path and any dir included in the filename path exist
|
32
|
+
FileUtils.mkdir_p dest unless Dir.exist? dest
|
33
|
+
if not filename.nil?
|
34
|
+
subdir = File.join(dest, File.dirname(filename))
|
35
|
+
FileUtils.mkdir_p subdir unless Dir.exist? subdir
|
36
|
+
end
|
37
|
+
|
38
|
+
# download it
|
39
|
+
tmpfile = Down.download(self)
|
40
|
+
|
41
|
+
# in case a filename is specified, save it like that
|
42
|
+
# otherwise, try using the original filename
|
43
|
+
if not filename.nil?
|
44
|
+
FileUtils.mv(tmpfile.path, "#{dest}/#{filename}")
|
45
|
+
else
|
46
|
+
FileUtils.mv(tmpfile.path, "#{dest}/#{tmpfile.original_filename}")
|
47
|
+
end
|
48
|
+
end
|
24
49
|
end
|
25
50
|
|
51
|
+
## Core
|
52
|
+
|
26
53
|
def printLogo
|
27
54
|
puts (" ____ _ _ ____\n" +
|
28
55
|
" | __ )(_) |_| _ \\ ___ __ _ _ __ ___ _ __\n" +
|
@@ -66,8 +93,15 @@ def printProgress(item,indx,stage)
|
|
66
93
|
msg += "\n"
|
67
94
|
end
|
68
95
|
|
69
|
-
|
70
|
-
|
96
|
+
toPrint = (" ► " + "[#{indx+1}/#{$total}] ".ljust(12)).bold + item.ellipsisize.light_magenta.underline + " ➔ " + msg
|
97
|
+
if $parallel
|
98
|
+
if $verbose
|
99
|
+
puts toPrint
|
100
|
+
end
|
101
|
+
else
|
102
|
+
print "\r"
|
103
|
+
print toPrint
|
104
|
+
end
|
71
105
|
end
|
72
106
|
|
73
107
|
def saveStoreToFile(file,store)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bitreaper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dr.Kameleon
|
@@ -10,8 +10,9 @@ bindir: bin
|
|
10
10
|
cert_chain: []
|
11
11
|
date: 2020-04-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: Automated Web-Scraping Client for Ruby
|
14
|
-
|
13
|
+
description: Automated Web-Scraping Client for Ruby using SLD2-like configuration
|
14
|
+
files. Supports XPath and CSS selectors via Nokogiri.
|
15
|
+
email: drkameleon@gmail.com
|
15
16
|
executables:
|
16
17
|
- bitreaper
|
17
18
|
extensions: []
|
@@ -20,7 +21,7 @@ files:
|
|
20
21
|
- bin/bitreaper
|
21
22
|
- lib/bitreaper.rb
|
22
23
|
- lib/bitreaper/helpers.rb
|
23
|
-
homepage: https://
|
24
|
+
homepage: https://github.com/drkameleon/BitReaper
|
24
25
|
licenses:
|
25
26
|
- MIT
|
26
27
|
metadata: {}
|