bitreaper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/bitreaper +113 -0
- data/lib/bitreaper.rb +183 -0
- data/lib/bitreaper/helpers.rb +91 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ef322874a2e0f8e3876e554671d80ff724d6fb598be0c7e25be992c268bbac5a
|
4
|
+
data.tar.gz: c6c21e747415b48c0246cf07bb95d4512238d358859699593235365a69c0f752
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3c7fd3310b52989dab3299294944c507ca39cecffd02544ce6c4a9f95a2db2016dee184c9bbdb5848ba5411297daa021abb9ff1c010ee2250976561fd3af5f08
|
7
|
+
data.tar.gz: 6d0294a916a199ddf934ad873b7943a2a8106e1c4209b5d2125c26fd5f2fb882d5f8730c24495e0286687b22aeda8329301359b6c11761735c354dabde60a81f
|
data/bin/bitreaper
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#######################################################
|
3
|
+
# BitReaper
|
4
|
+
# Automated Web-Scraping Client for Ruby
|
5
|
+
#
|
6
|
+
# (c)2020 Yanis Zafirópulos
|
7
|
+
# aka Dr.Kameleon
|
8
|
+
#
|
9
|
+
# <yaniszaf@gmail.com>
|
10
|
+
#######################################################
|
11
|
+
# @file bin/bitreaper
|
12
|
+
#######################################################
|
13
|
+
|
14
|
+
require 'colorize'
|
15
|
+
require 'optparse'
|
16
|
+
|
17
|
+
require 'bitreaper'
|
18
|
+
|
19
|
+
##########################################
|
20
|
+
# ENTRY
|
21
|
+
##########################################
|
22
|
+
|
23
|
+
# Set defaults
|
24
|
+
|
25
|
+
# Defaults for the command-line options; each one is overwritten by the
# matching switch parsed below.
$url = ""                     # single url to scrape       (-u / --url)
$inputFile = ""               # file listing urls to scrape (-i / --input)
$outputFile = "output.json"   # where the JSON result goes  (-o / --output)

$verbose = false              # set by -v / --verbose

# Parse command-line options.
# parse! removes every recognised switch from ARGV, so afterwards only the
# positional <parser> argument is left in it.

ARGV.options do |opts|
  opts.banner = "BitReaper v0.1.0\n".bold +
                "(c)2020 Dr.Kameleon\n\n"+
                "Usage: bitreaper <parser> [options]\n\n"

  # the long form takes a single "=" ("--input=INPUT"), like the other switches
  opts.on("-iINPUT", "--input=INPUT",String,"Set input file") {|val| $inputFile = val }
  opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output file") {|val| $outputFile = val }
  opts.on("-uURL", "--url=URL",String,"Set single url input") {|val| $url = val }
  # a plain flag: it takes no argument, so no argument type is declared
  opts.on("-v", "--verbose", "Print debug messages") { $verbose = true }
  opts.on_tail("-h", "--help","Show this help page") {
    puts opts
    puts ""
    exit
  }
  opts.parse!
end
|
49
|
+
|
50
|
+
# Check if everything is correct
|
51
|
+
|
52
|
+
# Validate the arguments left after option parsing. showError prints the
# message and terminates the script, so every check below is a hard stop.

showError("Parser file not provided!\nexpected: bitreaper <parser> [options]") if ARGV.count == 0
showError("Too many arguments provided!\nexpected: bitreaper <parser> [options]") if ARGV.count > 1
showError("Both input file and single url provided!\nexpected: one of the two options") if $url!="" && $inputFile!=""
# without this check the script would go on to "scrape" the empty url
showError("Neither input file nor single url provided!\nexpected: one of the two options") if $url=="" && $inputFile==""

$parserFile = ARGV[0]
showError("Parser file does not exist!\npath given: #{$parserFile}") unless File.exist? $parserFile

if $inputFile!=""
  showError("Input file does not exist!\npath given: #{$inputFile}") unless File.exist? $inputFile
end
|
62
|
+
|
63
|
+
# Show our logo
|
64
|
+
|
65
|
+
printLogo()

# Read parser

showInfo("reading parser: #{$parserFile}")
$parser = BitReaper.getParser($parserFile)

if $inputFile!=""

  # Read urls: one per line. Surrounding whitespace (e.g. the "\r" left by
  # CRLF line endings) is stripped and blank lines are skipped, so neither
  # ends up being downloaded as a url.
  showInfo("reading urls: #{$inputFile}")
  $urls = File.read($inputFile).split("\n").map(&:strip).reject(&:empty?)

  # Initial setup
  $total = $urls.count
  puts "\n"

  # Process project: one result Hash per url, in input order
  $store = $urls.each_with_index.map{|url,i|
    BitReaper.new(url,$parser,i).process()
  }

  # Save store to file
  saveStoreToFile($outputFile,$store)

else

  # Initial setup
  $total = 1
  puts "\n"

  # Process single url.
  # The index is zero-based (printProgress displays index+1), so the only
  # entry is number 0; passing 1 made the progress line read "[2/1]".
  br = BitReaper.new($url,$parser,0)
  $store = br.process()

  # Save store to file
  saveStoreToFile($outputFile,$store)

end
|
107
|
+
|
108
|
+
#######################################################
|
109
|
+
#
|
110
|
+
# This is the end;
|
111
|
+
# my only friend, the end...
|
112
|
+
#
|
113
|
+
#######################################################
|
data/lib/bitreaper.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
#######################################################
|
2
|
+
# BitReaper
|
3
|
+
# Automated Web-Scraping Client for Ruby
|
4
|
+
#
|
5
|
+
# (c)2020 Yanis Zafirópulos
|
6
|
+
# aka Dr.Kameleon
|
7
|
+
#
|
8
|
+
# <yaniszaf@gmail.com>
|
9
|
+
#######################################################
|
10
|
+
# @file lib/bitreaper.rb
|
11
|
+
#######################################################
|
12
|
+
|
13
|
+
require 'awesome_print'
|
14
|
+
require 'colorize'
|
15
|
+
require 'json'
|
16
|
+
require 'liquid'
|
17
|
+
require 'nokogiri'
|
18
|
+
require 'open-uri'
|
19
|
+
require 'sdl4r'
|
20
|
+
require 'watir'
|
21
|
+
require 'webdrivers'
|
22
|
+
|
23
|
+
require_relative 'bitreaper/helpers.rb'
|
24
|
+
|
25
|
+
##########################################
|
26
|
+
# SUPERGLOBALS
|
27
|
+
##########################################
|
28
|
+
|
29
|
+
$bitreaper_version = 0.1
|
30
|
+
|
31
|
+
##########################################
|
32
|
+
# MAIN CLASS
|
33
|
+
##########################################
|
34
|
+
|
35
|
+
# Scrapes one url: downloads the page, walks a parser definition (an SDL tag
# tree) and collects the values selected by each tag into a nested Hash.
class BitReaper
  # Downloads +url+ right away; call #process afterwards to extract the data.
  #
  # @param url [String] address of the page to scrape
  # @param parser [String, Object] path of a parser file, or an already
  #   loaded parser as returned by BitReaper.getParser
  # @param i [Integer] zero-based position of this url, used only for the
  #   progress display
  def initialize(url,parser,i=0)
    @url = url
    # getParser is a class method, so it must be reached through the class
    @parser = parser.is_a?(String) ? self.class.getParser(parser) : parser

    @index = i
    @store = {}

    @noko = download(@url)
  end

  # Reads a parser file, echoes it to stdout and parses it with SDL4R.
  #
  # @param file [String] path of the parser file
  # @return the value returned by SDL4R::read for the file's content
  def self.getParser(file)
    source = File.read(file)
    # Expand the "name!" flag shorthand to "name=on". The lookahead leaves the
    # whitespace after the "!" in place; consuming it would glue the flag to
    # the next attribute or to the next line.
    source = source.gsub(/(\w+)!(?=\s|\z)/,'\1=on')

    puts source.split("\n").map{|l| " "+l}.join("\n").light_black
    puts ""

    SDL4R::read(source)
  end

  # Fetches +url+ and parses the response as HTML.
  #
  # @param url [String] address to fetch
  # @param withProgress [Boolean] print the "Downloading..." progress line
  # @return the document built by Nokogiri::HTML
  def download(url,withProgress=true)
    printProgress(@url,@index,0) if withProgress

    # URI.open is open-uri's entry point; calling Kernel#open with a url is
    # deprecated since Ruby 2.7 and no longer works on Ruby 3.
    Nokogiri::HTML(URI.open(url))
  end

  # Applies one string-level attribute to a value.
  #
  # @param attrb [String] attribute name (prepend, append, capitalize,
  #   uppercase, lowercase, trim, replace, remove, split)
  # @param val [String] current value
  # @param param [String, Array<String>, nil] attribute argument; "replace"
  #   reads param[0] (what to replace) and param[1] (the replacement)
  # @return [String, Array<String>] the transformed value ("split" yields an
  #   Array); unknown attributes return +val+ untouched
  def processStringValue(attrb,val,param)
    case attrb
    when "prepend" then param + val
    when "append" then val + param
    when "capitalize" then val.capitalize
    when "uppercase" then val.upcase
    when "lowercase" then val.downcase
    when "trim" then val.strip
    when "replace" then val.gsub(param[0], param[1])
    when "remove" then val.gsub(param,"")
    when "split" then val.split(param)
    else val
    end
  end

  # Applies one array-level attribute to a list of values.
  #
  # @param attrb [String] attribute name (join, first, last, index,
  #   select.include, select.match)
  # @param val [Array<String>] current values
  # @param param [String, nil] attribute argument; for the select.* attributes
  #   a leading "/" marks it as a regular expression
  # @return the transformed value; unknown attributes return +val+ untouched
  def processArrayValue(attrb,val,param)
    case attrb
    when "join" then val.join(param)
    when "first" then val.first
    when "last" then val.last
    when "index" then val[param.to_i]
    when "select.include"
      if param.start_with? "/"
        regex = regexFromParam(param)
        val.select{|r| r.match?(regex) }
      else
        val.select{|r| r.include? param }
      end
    when "select.match"
      if param.start_with? "/"
        regex = regexFromParam(param,true)
        val.select{|r| r.match?(regex) }
      else
        val.select{|r| r==param }
      end
    else val
    end
  end

  # Turns the matched nodes into a value and runs every attribute over it.
  #
  # @param values matched nodes; each must respond to #content
  #   (presumably a Nokogiri node set - confirm against callers)
  # @param attrbs [Hash] attribute name => argument, applied in order
  # @return [String, Array] the processed value, "" when it ended up nil
  def processValues(values,attrbs)
    # check if we have a single value or an array of values
    ret = (values.count==1) ? values[0].content : values.map{|v| v.content}

    # no attributes, just return it
    return ret if attrbs.size==0

    attrbs.each{|attrb,arg|
      # an earlier attribute (e.g. "first" on an empty list) left nothing
      # to work on, so the remaining attributes cannot be applied
      break if ret.nil?

      param = nil
      if arg.is_a? String
        # get params if we have multiple params ("a||b"); or not.
        # Every param is rendered as a Liquid template against the values
        # stored so far.
        param = if arg.include? "||"
          arg.split("||").map{|a| Liquid::Template.parse(a).render(@store) }
        else
          Liquid::Template.parse(arg).render(@store)
        end
      end

      ret = if ret.is_a? String
        # if our value is a String, process it accordingly
        processStringValue(attrb,ret,param)
      else
        # it's an array of values, so look for array-operating attributes
        processArrayValue(attrb,ret,param)
      end
    }

    ret.nil? ? "" : ret
  end

  # Walks the children of a parser node and fills +store+ with the results.
  #
  # A child without children of its own is a "get": its first value is used
  # as the search pattern and the processed matches are stored under the
  # child's name. A child with children is a "section": a nested Hash is
  # stored under its name and filled recursively, searching inside the nodes
  # matched by its pattern (or inside +noko+ itself when it has no pattern).
  #
  # @param noko document or node set to search in (must respond to #search)
  # @param node parser node whose children are processed; children are read
  #   through #name, #values, #attributes and #children
  # @param store [Hash] destination for the extracted values
  # @param level [Integer] current nesting depth
  def processNode(noko,node,store,level=0)
    node.children.each{|child|
      tag = child.name
      pattern = child.values[0]
      attrs = child.attributes

      if child.children.count==0
        # no children, so it's a "get"
        values = noko.search(pattern)
        store[tag] = processValues(values, attrs) if values.count>0
      else
        # it's a "section"
        store[tag] = {}
        subnoko = pattern.nil? ? noko : noko.search(pattern)
        processNode(subnoko,child,store[tag],level+1)
      end
    }
  end

  # Runs the parser over the downloaded page.
  #
  # @return [Hash] the extracted values
  def process
    printProgress(@url,@index,1)
    processNode(@noko, @parser, @store)

    printProgress(@url,@index,2)
    @store
  end

  private

  # Builds a Regexp from a "/pattern/" argument. Only the delimiting slashes
  # are removed, so slashes inside the pattern survive. With +exact+ the
  # pattern has to match the whole string.
  def regexFromParam(param,exact=false)
    source = param.sub(%r{\A/}, '').sub(%r{/\z}, '')
    # the anchors are written with escaped backslashes: inside a
    # double-quoted string a bare "\A" would be just the letter "A"
    source = "\\A(?:#{source})\\Z" if exact
    Regexp.new(source)
  end

end
|
177
|
+
|
178
|
+
#######################################################
|
179
|
+
#
|
180
|
+
# This is the end;
|
181
|
+
# my only friend, the end...
|
182
|
+
#
|
183
|
+
#######################################################
|
data/lib/bitreaper/helpers.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
#######################################################
|
2
|
+
# BitReaper
|
3
|
+
# Automated Web-Scraping Client for Ruby
|
4
|
+
#
|
5
|
+
# (c)2020 Yanis Zafirópulos
|
6
|
+
# aka Dr.Kameleon
|
7
|
+
#
|
8
|
+
# <yaniszaf@gmail.com>
|
9
|
+
#######################################################
|
10
|
+
# @file lib/bitreaper/helpers.rb
|
11
|
+
#######################################################
|
12
|
+
|
13
|
+
##########################################
|
14
|
+
# HELPER FUNCTIONS
|
15
|
+
##########################################
|
16
|
+
|
17
|
+
class String
  # Shortens a long string by replacing its middle part with "...".
  #
  # Slicing is used rather than a regular expression so that strings
  # containing newlines are shortened as well.
  #
  # @param minimum_length [Integer] strings shorter than this are returned as is
  # @param edge_length [Integer] number of characters kept at each end;
  #   strings of at most twice this length are returned as is
  # @return [String] the receiver itself when it is short enough, otherwise
  #   the first and last +edge_length+ characters joined by "..."
  def ellipsisize(minimum_length=15,edge_length=15)
    return self if length < minimum_length || length <= edge_length*2

    "#{self[0, edge_length]}...#{self[-edge_length, edge_length]}"
  end
end
|
25
|
+
|
26
|
+
# Prints the coloured BitReaper ASCII-art logo and the copyright line to
# stdout, followed by an empty line.
def printLogo
  art = [
    " ____ _ _ ____\n",
    " | __ )(_) |_| _ \\ ___ __ _ _ __ ___ _ __\n",
    " | _ \\| | __| |_) / _ \\/ _` | '_ \\ / _ \\ '__|\n",
    " | |_) | | |_| _ < __/ (_| | |_) | __/ |\n",
    " |____/|_|\\__|_| \\_\\___|\\__,_| .__/ \\___|_|\n"
  ].join

  credit = " (c) 2020, Dr.Kameleon".cyan
  tail = " |_| ".light_cyan.bold

  puts art.light_cyan.bold + credit + tail
  puts ""
end
|
37
|
+
|
38
|
+
# Prints the program header and an error box containing +msg+ to stdout,
# then terminates the process with exit status 1 so that shells and scripts
# can tell the run failed.
#
# @param msg [String] error text; continuation lines are indented by one space
def showError(msg)
  puts " BitReaper v0.1.0\n".bold +
       " (c)2020 Dr.Kameleon\n"
  puts "-" * 90
  print " ✘ ERROR: ".light_red.bold
  puts msg.split("\n").join("\n ")
  puts "-" * 90
  puts ""
  # a bare `exit` would report success (status 0)
  exit 1
end
|
48
|
+
|
49
|
+
# Prints an informational line, prefixed with a bullet, to stdout.
#
# @param msg [String] text to show
def showInfo(msg)
  bullet = " ● "
  puts bullet + msg
end
|
52
|
+
|
53
|
+
# Prints a success line (bullet + message, bold light green) to stdout,
# followed by an empty line.
#
# @param msg [String] text to show
def showSuccess(msg)
  banner = (" ● " + msg).light_green.bold
  puts banner, ""
end
|
57
|
+
|
58
|
+
# Rewrites the current terminal line with the progress of one item.
#
# @param item [String] label of the item (shortened with String#ellipsisize)
# @param indx [Integer] zero-based position of the item; shown as indx+1 out
#   of the global $total
# @param stage [Integer] 0 = downloading, 1 = processing, 2 = done (the only
#   stage that ends the line with a newline)
def printProgress(item,indx,stage)
  msg = case stage
        when 0 then "Downloading..."
        when 1 then "Processing... "
        when 2 then "OK ✔︎ ".light_green.bold + "\n"
        end

  # the carriage return moves back to the start of the line so that the
  # previous stage is overwritten
  print "\r"

  counter = " ► [#{indx+1}/#{$total}] ".bold
  label = item.ellipsisize.light_magenta.underline
  print counter + label + " ➔ " + msg
end
|
72
|
+
|
73
|
+
# Reports the end of processing, writes +store+ to +file+ as pretty-printed
# JSON (replacing any previous content) and prints a success message.
#
# @param file [String] path of the output file
# @param store [Hash, Array] collected data to serialise
def saveStoreToFile(file,store)
  puts ""
  showInfo("finished processing #{$total} entries")
  showInfo("saving to file: " + file)
  puts "\n"

  File.write(file, JSON.pretty_generate(store))

  showSuccess("SUCCESS :)")
end
|
85
|
+
|
86
|
+
#######################################################
|
87
|
+
#
|
88
|
+
# This is the end;
|
89
|
+
# my only friend, the end...
|
90
|
+
#
|
91
|
+
#######################################################
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bitreaper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dr.Kameleon
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-04-09 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Automated Web-Scraping Client for Ruby
|
14
|
+
email: yaniszaf@gmail.com
|
15
|
+
executables:
|
16
|
+
- bitreaper
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/bitreaper
|
21
|
+
- lib/bitreaper.rb
|
22
|
+
- lib/bitreaper/helpers.rb
|
23
|
+
homepage: https://rubygems.org/gems/bitreaper
|
24
|
+
licenses:
|
25
|
+
- MIT
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubygems_version: 3.0.4
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: BitReaper
|
46
|
+
test_files: []
|