bitreaper 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/bitreaper +27 -9
- data/lib/bitreaper.rb +71 -13
- data/lib/bitreaper/helpers.rb +36 -2
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 806186133e8f475e8040fc7bee4e676d49665f83f16bcb127c171b7239e0aa94
|
4
|
+
data.tar.gz: 693e9fbe65d0b4e697c9cf7eaa871ed905b35509b8a0ff7dd0953933c8b77635
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3080d190593d846cb6cc225c9e775e3899725a10e8785b7d506ba931d12203bdf76f7b0fe533f09b6b074761c1df9f0d46a80c513849377b1ec0cc8b49fdf4f6
|
7
|
+
data.tar.gz: 84a2212ce1ab9c9ccd3a43cec0bcff53d3b6ff814d56bc833488668193b7e8ffc80b18732a07b0c5409fa392e645c8c7de5a58f2f052da3bcee772adffe26dbe
|
data/bin/bitreaper
CHANGED
@@ -13,6 +13,8 @@
|
|
13
13
|
|
14
14
|
require 'colorize'
|
15
15
|
require 'optparse'
|
16
|
+
require 'parallel'
|
17
|
+
require 'ruby-progressbar'
|
16
18
|
|
17
19
|
require 'bitreaper'
|
18
20
|
|
@@ -24,8 +26,9 @@ require 'bitreaper'
|
|
24
26
|
|
25
27
|
$url = ""
|
26
28
|
$inputFile = ""
|
27
|
-
$
|
29
|
+
$outputDest = "output"
|
28
30
|
|
31
|
+
$parallel = false
|
29
32
|
$verbose = false
|
30
33
|
|
31
34
|
# Parse command-line options
|
@@ -36,9 +39,10 @@ ARGV.options do |opts|
|
|
36
39
|
"Usage: bitreaper <parser> [options]\n\n"
|
37
40
|
|
38
41
|
opts.on("-iINPUT", "--input==INPUT",String,"Set input file") {|val| $inputFile = val }
|
39
|
-
opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output
|
42
|
+
opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output name") {|val| $outputDest = val }
|
40
43
|
opts.on("-uURL", "--url=URL",String,"Set single url input") {|val| $url = val }
|
41
|
-
opts.on("-
|
44
|
+
opts.on("-p", "--parallel", "Perform multi-threaded processing"){|val| $parallel = true }
|
45
|
+
opts.on("-v", "--verbose", "Print debug messages") {|val| $verbose = true }
|
42
46
|
opts.on_tail("-h", "--help","Show this help page") {
|
43
47
|
puts opts
|
44
48
|
puts ""
|
@@ -82,13 +86,27 @@ if $inputFile!=""
|
|
82
86
|
|
83
87
|
# Process project
|
84
88
|
$store = []
|
85
|
-
$
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
+
if not $parallel
|
90
|
+
$urls.each_with_index{|url,i|
|
91
|
+
br = BitReaper.new(url,$parser,i)
|
92
|
+
$store << br.process()
|
93
|
+
}
|
94
|
+
else
|
95
|
+
if not $verbose
|
96
|
+
Parallel.each_with_index($urls, in_threads: 4, progress: " ► Processing... "){|url,i|
|
97
|
+
br = BitReaper.new(url,$parser,i)
|
98
|
+
$store << br.process()
|
99
|
+
}
|
100
|
+
else
|
101
|
+
Parallel.each_with_index($urls, in_threads: 4){|url,i|
|
102
|
+
br = BitReaper.new(url,$parser,i)
|
103
|
+
$store << br.process()
|
104
|
+
}
|
105
|
+
end
|
106
|
+
end
|
89
107
|
|
90
108
|
# Save store to file
|
91
|
-
saveStoreToFile($
|
109
|
+
saveStoreToFile("#{$outputDest}.json",$store)
|
92
110
|
|
93
111
|
else
|
94
112
|
|
@@ -101,7 +119,7 @@ else
|
|
101
119
|
$store = br.process()
|
102
120
|
|
103
121
|
# Save store to file
|
104
|
-
saveStoreToFile($
|
122
|
+
saveStoreToFile("#{$outputDest}.json",$store)
|
105
123
|
|
106
124
|
end
|
107
125
|
|
data/lib/bitreaper.rb
CHANGED
@@ -26,13 +26,22 @@ require_relative 'bitreaper/helpers.rb'
|
|
26
26
|
# SUPERGLOBALS
|
27
27
|
##########################################
|
28
28
|
|
29
|
-
$bitreaper_version = 0.1
|
29
|
+
$bitreaper_version = 0.1.2
|
30
30
|
|
31
|
-
|
32
|
-
# MAIN CLASS
|
33
|
-
|
31
|
+
####################################################################################
|
32
|
+
# **MAIN CLASS**
|
33
|
+
# This is the main Web Scraper object. It is through a `BitReaper` instance
|
34
|
+
# that you can start scraping
|
35
|
+
####################################################################################
|
34
36
|
|
35
37
|
class BitReaper
|
38
|
+
|
39
|
+
# Create a new BitReaper instance
|
40
|
+
#
|
41
|
+
# @param [String] url The URL of the page to be scraped
|
42
|
+
# @param [String,SDL4R::Tag] parser The parser
|
43
|
+
# @param [Integer] i Index of the current operation (for reporting purposes)
|
44
|
+
#---------------------------------------------------------------------------
|
36
45
|
def initialize(url,parser,i=0)
|
37
46
|
@url = url
|
38
47
|
@parser = (parser.is_a? String) ? self.getParser(parser) : parser
|
@@ -43,10 +52,16 @@ class BitReaper
|
|
43
52
|
@noko = self.download(@url)
|
44
53
|
end
|
45
54
|
|
55
|
+
# Get a new parser from a given parser path
|
56
|
+
#
|
57
|
+
# @param [String] file The path of the `.br` parser file
|
58
|
+
#
|
59
|
+
# @return [SDL4R::Tag] The resulting parser
|
60
|
+
#---------------------------------------------------------------------------
|
46
61
|
def self.getParser(file)
|
47
62
|
parserFile = File.read(file)
|
48
63
|
parserFile = parserFile.gsub(/([\w]+)\!\s/,'\1=on')
|
49
|
-
if
|
64
|
+
if $verbose
|
50
65
|
puts parserFile.split("\n").map{|l| " "+l}.join("\n").light_black
|
51
66
|
puts ""
|
52
67
|
end
|
@@ -54,12 +69,38 @@ class BitReaper
|
|
54
69
|
return SDL4R::read(parserFile)
|
55
70
|
end
|
56
71
|
|
72
|
+
# Process current project
|
73
|
+
#---------------------------------------------------------------------------
|
74
|
+
def process
|
75
|
+
printProgress(@url,@index,1)
|
76
|
+
processNode(@noko, @parser, @store)
|
77
|
+
|
78
|
+
printProgress(@url,@index,2)
|
79
|
+
return @store
|
80
|
+
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
# Download given URL
|
85
|
+
#
|
86
|
+
# @param [String] url The URL to be downloaded
|
87
|
+
#
|
88
|
+
# @return [Nokogiri::XML::NodeSet] The resulting nodes
|
89
|
+
#---------------------------------------------------------------------------
|
57
90
|
def download(url,withProgress=true)
|
58
91
|
printProgress(@url,@index,0) if withProgress
|
59
92
|
|
60
93
|
return Nokogiri::HTML(open(url))
|
61
94
|
end
|
62
95
|
|
96
|
+
# Process String value using attribute
|
97
|
+
#
|
98
|
+
# @param [String] attrb The attribute to be processed
|
99
|
+
# @param [String] val The value to be processed
|
100
|
+
# @param [String] param The attribute's param (if any)
|
101
|
+
#
|
102
|
+
# @return [String,Array] The result of the operation
|
103
|
+
#---------------------------------------------------------------------------
|
63
104
|
def processStringValue(attrb,val,param)
|
64
105
|
case attrb
|
65
106
|
when "prepend"
|
@@ -80,10 +121,21 @@ class BitReaper
|
|
80
121
|
val = val.gsub(param,"")
|
81
122
|
when "split"
|
82
123
|
val = val.split(param)
|
124
|
+
when "download"
|
125
|
+
val = val
|
126
|
+
val.downloadAs($outputDest,(param.is_a? String) ? param : nil)
|
83
127
|
end
|
84
128
|
return val
|
85
129
|
end
|
86
130
|
|
131
|
+
# Process Array value using attribute
|
132
|
+
#
|
133
|
+
# @param [String] attrb The attribute to be processed
|
134
|
+
# @param [Array] val The value to be processed
|
135
|
+
# @param [String] param The attribute's param (if any)
|
136
|
+
#
|
137
|
+
# @return [String,Array] The result of the operation
|
138
|
+
#---------------------------------------------------------------------------
|
87
139
|
def processArrayValue(attrb,val,param)
|
88
140
|
case attrb
|
89
141
|
when "join"
|
@@ -110,6 +162,13 @@ class BitReaper
|
|
110
162
|
return val
|
111
163
|
end
|
112
164
|
|
165
|
+
# Process parsed values using set of attributes
|
166
|
+
#
|
167
|
+
# @param [Array] values The parsed values
|
168
|
+
# @param [Array] attrbs The associated attributes
|
169
|
+
#
|
170
|
+
# @return [String,Array] The result of the operation
|
171
|
+
#---------------------------------------------------------------------------
|
113
172
|
def processValues(values,attrbs)
|
114
173
|
# check if we have a single value or an array of values
|
115
174
|
ret = (values.count==1) ? values[0].content
|
@@ -138,6 +197,13 @@ class BitReaper
|
|
138
197
|
return (ret.nil?) ? "" : ret
|
139
198
|
end
|
140
199
|
|
200
|
+
# Process a given node using provided parser and temporary storage hash
|
201
|
+
#
|
202
|
+
# @param [Nokogiri::XML::node] noko The Nokogiri node to work on
|
203
|
+
# @param [SDL4R::Tag] node The parser node
|
204
|
+
# @param [Hash] store The temporary storage hash
|
205
|
+
# @param [Integer] level The nesting level (for informational purposes)
|
206
|
+
#---------------------------------------------------------------------------
|
141
207
|
def processNode(noko,node,store,level=0)
|
142
208
|
node.children.each{|child|
|
143
209
|
command = child.namespace
|
@@ -165,14 +231,6 @@ class BitReaper
|
|
165
231
|
}
|
166
232
|
end
|
167
233
|
|
168
|
-
def process
|
169
|
-
printProgress(@url,@index,1)
|
170
|
-
processNode(@noko, @parser, @store)
|
171
|
-
|
172
|
-
printProgress(@url,@index,2)
|
173
|
-
return @store
|
174
|
-
end
|
175
|
-
|
176
234
|
end
|
177
235
|
|
178
236
|
#######################################################
|
data/lib/bitreaper/helpers.rb
CHANGED
@@ -10,10 +10,15 @@
|
|
10
10
|
# @file lib/bitreaper/helpers.rb
|
11
11
|
#######################################################
|
12
12
|
|
13
|
+
require 'down'
|
14
|
+
require 'fileutils'
|
15
|
+
|
13
16
|
##########################################
|
14
17
|
# HELPER FUNCTIONS
|
15
18
|
##########################################
|
16
19
|
|
20
|
+
## Misc
|
21
|
+
|
17
22
|
class String
|
18
23
|
def ellipsisize(minimum_length=15,edge_length=15)
|
19
24
|
return self if self.length < minimum_length or self.length <= edge_length*2
|
@@ -21,8 +26,30 @@ class String
|
|
21
26
|
mid_length = self.length - edge_length*2
|
22
27
|
gsub(/(#{edge}).{#{mid_length},}(#{edge})/, '\1...\2')
|
23
28
|
end
|
29
|
+
|
30
|
+
def downloadAs(dest,filename=nil)
|
31
|
+
# make sure 'dest' path and any dir included in the filename path exist
|
32
|
+
FileUtils.mkdir_p dest unless Dir.exist? dest
|
33
|
+
if not filename.nil?
|
34
|
+
subdir = File.join(dest, File.dirname(filename))
|
35
|
+
FileUtils.mkdir_p subdir unless Dir.exist? subdir
|
36
|
+
end
|
37
|
+
|
38
|
+
# download it
|
39
|
+
tmpfile = Down.download(self)
|
40
|
+
|
41
|
+
# in case a filename is specified, save it like that
|
42
|
+
# otherwise, try using the original filename
|
43
|
+
if not filename.nil?
|
44
|
+
FileUtils.mv(tmpfile.path, "#{dest}/#{filename}")
|
45
|
+
else
|
46
|
+
FileUtils.mv(tmpfile.path, "#{dest}/#{tmpfile.original_filename}")
|
47
|
+
end
|
48
|
+
end
|
24
49
|
end
|
25
50
|
|
51
|
+
## Core
|
52
|
+
|
26
53
|
def printLogo
|
27
54
|
puts (" ____ _ _ ____\n" +
|
28
55
|
" | __ )(_) |_| _ \\ ___ __ _ _ __ ___ _ __\n" +
|
@@ -66,8 +93,15 @@ def printProgress(item,indx,stage)
|
|
66
93
|
msg += "\n"
|
67
94
|
end
|
68
95
|
|
69
|
-
|
70
|
-
|
96
|
+
toPrint = (" ► " + "[#{indx+1}/#{$total}] ".ljust(12)).bold + item.ellipsisize.light_magenta.underline + " ➔ " + msg
|
97
|
+
if $parallel
|
98
|
+
if $verbose
|
99
|
+
puts toPrint
|
100
|
+
end
|
101
|
+
else
|
102
|
+
print "\r"
|
103
|
+
print toPrint
|
104
|
+
end
|
71
105
|
end
|
72
106
|
|
73
107
|
def saveStoreToFile(file,store)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bitreaper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dr.Kameleon
|
@@ -10,8 +10,9 @@ bindir: bin
|
|
10
10
|
cert_chain: []
|
11
11
|
date: 2020-04-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: Automated Web-Scraping Client for Ruby
|
14
|
-
|
13
|
+
description: Automated Web-Scraping Client for Ruby using SLD2-like configuration
|
14
|
+
files. Supports XPath and CSS selectors via Nokogiri.
|
15
|
+
email: drkameleon@gmail.com
|
15
16
|
executables:
|
16
17
|
- bitreaper
|
17
18
|
extensions: []
|
@@ -20,7 +21,7 @@ files:
|
|
20
21
|
- bin/bitreaper
|
21
22
|
- lib/bitreaper.rb
|
22
23
|
- lib/bitreaper/helpers.rb
|
23
|
-
homepage: https://
|
24
|
+
homepage: https://github.com/drkameleon/BitReaper
|
24
25
|
licenses:
|
25
26
|
- MIT
|
26
27
|
metadata: {}
|