bitreaper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/bitreaper +113 -0
- data/lib/bitreaper.rb +183 -0
- data/lib/bitreaper/helpers.rb +91 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ef322874a2e0f8e3876e554671d80ff724d6fb598be0c7e25be992c268bbac5a
|
4
|
+
data.tar.gz: c6c21e747415b48c0246cf07bb95d4512238d358859699593235365a69c0f752
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3c7fd3310b52989dab3299294944c507ca39cecffd02544ce6c4a9f95a2db2016dee184c9bbdb5848ba5411297daa021abb9ff1c010ee2250976561fd3af5f08
|
7
|
+
data.tar.gz: 6d0294a916a199ddf934ad873b7943a2a8106e1c4209b5d2125c26fd5f2fb882d5f8730c24495e0286687b22aeda8329301359b6c11761735c354dabde60a81f
|
data/bin/bitreaper
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#######################################################
|
3
|
+
# BitReaper
|
4
|
+
# Automated Web-Scraping Client for Ruby
|
5
|
+
#
|
6
|
+
# (c)2020 Yanis Zafirópulos
|
7
|
+
# aka Dr.Kameleon
|
8
|
+
#
|
9
|
+
# <yaniszaf@gmail.com>
|
10
|
+
#######################################################
|
11
|
+
# @file bin/bitreaper
|
12
|
+
#######################################################
|
13
|
+
|
14
|
+
require 'colorize'
|
15
|
+
require 'optparse'
|
16
|
+
|
17
|
+
require 'bitreaper'
|
18
|
+
|
19
|
+
##########################################
|
20
|
+
# ENTRY
|
21
|
+
##########################################
|
22
|
+
|
23
|
+
# Set defaults
|
24
|
+
|
25
|
+
# Command-line state. Kept as globals because the helper functions and the
# BitReaper library read some of them (e.g. $total in printProgress).
$url = ""
$inputFile = ""
$outputFile = "output.json"

$verbose = false

# Parse command-line options

ARGV.options do |opts|
  opts.banner = "BitReaper v0.1.0\n".bold +
                "(c)2020 Dr.Kameleon\n\n" +
                "Usage: bitreaper <parser> [options]\n\n"

  opts.on("-iINPUT", "--input=INPUT", String, "Set input file") { |val| $inputFile = val }
  opts.on("-oOUTPUT", "--output=OUTPUT", String, "Set output file") { |val| $outputFile = val }
  opts.on("-uURL", "--url=URL", String, "Set single url input") { |val| $url = val }
  # plain flag: it takes no argument, so no coercion class is passed
  opts.on("-v", "--verbose", "Print debug messages") { $verbose = true }
  opts.on_tail("-h", "--help", "Show this help page") {
    puts opts
    puts ""
    exit
  }
  # parse! strips the recognised options, leaving only positional arguments in ARGV
  opts.parse!
end

# Check if everything is correct

showError("Parser file not provided!\nexpected: bitreaper <parser> [options]") if ARGV.count == 0
showError("Too many arguments provided!\nexpected: bitreaper <parser> [options]") if ARGV.count > 1
showError("Both input file and single url provided!\nexpected: one of the two options") if $url != "" && $inputFile != ""
# without this check an empty url would be handed to the downloader
showError("No input provided!\nexpected: either --input or --url") if $url == "" && $inputFile == ""

$parserFile = ARGV[0]
showError("Parser file does not exist!\npath given: #{$parserFile}") unless File.exist? $parserFile

if $inputFile != ""
  showError("Input file does not exist!\npath given: #{$inputFile}") unless File.exist? $inputFile
end

# Show our logo

printLogo()

# Read parser

showInfo("reading parser: #{$parserFile}")
$parser = BitReaper.getParser($parserFile)

if $inputFile != ""

  # Read urls: one per line; surrounding whitespace (including "\r" from
  # CRLF files) is stripped and blank lines are skipped
  showInfo("reading urls: #{$inputFile}")
  $urls = File.read($inputFile).split("\n").map(&:strip).reject(&:empty?)

  # Initial setup
  $total = $urls.count
  puts "\n"

  # Process project
  $store = []
  $urls.each_with_index { |url, i|
    br = BitReaper.new(url, $parser, i)
    $store << br.process()
  }

  # Save store to file
  saveStoreToFile($outputFile, $store)

else

  # Initial setup
  $total = 1
  puts "\n"

  # Process single url.
  # The index is zero-based (printProgress displays indx+1), so the only
  # entry is number 0; passing 1 made the progress line read "[2/1]".
  br = BitReaper.new($url, $parser, 0)
  $store = br.process()

  # Save store to file
  saveStoreToFile($outputFile, $store)

end
|
107
|
+
|
108
|
+
#######################################################
|
109
|
+
#
|
110
|
+
# This is the end;
|
111
|
+
# my only friend, the end...
|
112
|
+
#
|
113
|
+
#######################################################
|
data/lib/bitreaper.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
#######################################################
|
2
|
+
# BitReaper
|
3
|
+
# Automated Web-Scraping Client for Ruby
|
4
|
+
#
|
5
|
+
# (c)2020 Yanis Zafirópulos
|
6
|
+
# aka Dr.Kameleon
|
7
|
+
#
|
8
|
+
# <yaniszaf@gmail.com>
|
9
|
+
#######################################################
|
10
|
+
# @file lib/bitreaper.rb
|
11
|
+
#######################################################
|
12
|
+
|
13
|
+
require 'awesome_print'
|
14
|
+
require 'colorize'
|
15
|
+
require 'json'
|
16
|
+
require 'liquid'
|
17
|
+
require 'nokogiri'
|
18
|
+
require 'open-uri'
|
19
|
+
require 'sdl4r'
|
20
|
+
require 'watir'
|
21
|
+
require 'webdrivers'
|
22
|
+
|
23
|
+
require_relative 'bitreaper/helpers.rb'
|
24
|
+
|
25
|
+
##########################################
|
26
|
+
# SUPERGLOBALS
|
27
|
+
##########################################
|
28
|
+
|
29
|
+
# Library version kept in a global as a Float (the gem itself is published as 0.1.0).
$bitreaper_version = 0.1
|
30
|
+
|
31
|
+
##########################################
|
32
|
+
# MAIN CLASS
|
33
|
+
##########################################
|
34
|
+
|
35
|
+
# Scrapes one URL according to a parser definition and collects the
# extracted values into a Hash.
class BitReaper
  # Stores the arguments and downloads the page right away.
  #
  # @param url    [String] address of the page to scrape
  # @param parser [String, Object] path of a parser file, or an already
  #   loaded parser as returned by BitReaper.getParser
  # @param i      [Integer] zero-based position of this url, used only for
  #   the progress display
  def initialize(url,parser,i=0)
    @url = url
    # getParser is defined on the class, so it has to be reached through
    # self.class (calling it on the instance raised NoMethodError)
    @parser = parser.is_a?(String) ? self.class.getParser(parser) : parser

    @index = i
    @store = {}

    @noko = download(@url)
  end

  # Reads a parser file, echoes its (expanded) text to stdout and hands it
  # to SDL4R.
  #
  # @param file [String] path of the parser file
  # @return whatever SDL4R::read returns for the expanded text
  def self.getParser(file)
    parserFile = expandFlags(File.read(file))

    puts parserFile.split("\n").map { |l| " " + l }.join("\n").light_black
    puts ""

    SDL4R::read(parserFile)
  end

  # Rewrites shorthand flags ("trim!") as regular attributes ("trim=on").
  # The lookahead leaves the following whitespace/newline in place so that
  # neighbouring attributes and lines are not glued together, and it also
  # accepts a flag at the very end of the text.
  def self.expandFlags(text)
    text.gsub(/(\w+)!(?=\s|\z)/, '\1=on')
  end
  private_class_method :expandFlags

  # Fetches +url+ and parses it as HTML.
  #
  # @param url          [String] address (or path) to open
  # @param withProgress [Boolean] print the "Downloading..." progress line
  # @return the Nokogiri document
  def download(url,withProgress=true)
    printProgress(@url,@index,0) if withProgress

    # URI.open is the open-uri entry point; Kernel#open no longer handles
    # URLs on Ruby 3 and would also treat "|cmd" strings as commands
    Nokogiri::HTML(URI.open(url))
  end

  # Applies one string-level attribute to a String value.
  #
  # @param attrb [String] attribute name: prepend, append, capitalize,
  #   uppercase, lowercase, trim, replace, remove or split
  # @param val   [String] current value
  # @param param argument of the attribute; for "replace" it is indexed
  #   with [0] (what to replace) and [1] (replacement)
  # @return the transformed value (an Array for "split"); +val+ unchanged
  #   for any other attribute name
  def processStringValue(attrb,val,param)
    case attrb
    when "prepend"    then param + val
    when "append"     then val + param
    when "capitalize" then val.capitalize
    when "uppercase"  then val.upcase
    when "lowercase"  then val.downcase
    when "trim"       then val.strip
    when "replace"    then val.gsub(param[0], param[1])
    when "remove"     then val.gsub(param, "")
    when "split"      then val.split(param)
    else val
    end
  end

  # Applies one array-level attribute to an Array of values.
  #
  # For "select.include" / "select.match" a +param+ starting with "/" is
  # treated as a regular expression (all "/" characters are dropped from
  # it); otherwise it is compared as plain text.
  #
  # @param attrb [String] attribute name: join, first, last, index,
  #   select.include or select.match
  # @param val   [Array] current values
  # @param param argument of the attribute
  # @return the transformed value; +val+ unchanged for any other name
  def processArrayValue(attrb,val,param)
    case attrb
    when "join"
      val.join(param)
    when "first"
      val.first
    when "last"
      val.last
    when "index"
      val[param.to_i]
    when "select.include"
      if param.start_with? "/"
        # built once instead of once per element
        pattern = Regexp.new(param.delete('/'))
        val.select { |r| r =~ pattern }
      else
        val.select { |r| r.include? param }
      end
    when "select.match"
      if param.start_with? "/"
        # the backslashes must be doubled: inside a double-quoted string
        # "\A" and "\Z" are just the letters "A" and "Z"
        pattern = Regexp.new("\\A#{param.delete('/')}\\Z")
        val.select { |r| r =~ pattern }
      else
        val.select { |r| r == param }
      end
    else
      val
    end
  end

  # Turns the matched nodes into a value and runs every attribute over it,
  # in the order the attributes are enumerated.
  #
  # @param values matched nodes; each must respond to +content+
  # @param attrbs [Hash] attribute name => argument
  # @return [String, Array] the processed value, "" if it ended up nil
  def processValues(values,attrbs)
    # a single match yields a String, several matches yield an Array
    ret = (values.count == 1) ? values[0].content : values.map { |v| v.content }

    # no attributes, just return it
    return ret if attrbs.size == 0

    attrbs.each { |attrb, arg|
      param =
        if arg.is_a? String
          # String arguments are Liquid templates rendered against the
          # values collected so far; "||" separates multiple arguments
          if arg.include? "||"
            arg.split("||").map { |a| Liquid::Template.parse(a).render(@store) }
          else
            Liquid::Template.parse(arg).render(@store)
          end
        else
          # non-String arguments (e.g. the number in index=2) are passed
          # through as they are; previously they were dropped
          arg
        end

      # the value may switch between String and Array along the way
      # (split / join), so the check is repeated for every attribute
      ret = if ret.is_a? String
              processStringValue(attrb, ret, param)
            else
              processArrayValue(attrb, ret, param)
            end
    }

    ret.nil? ? "" : ret
  end

  # Walks the children of a parser node and fills +store+.
  #
  # A child without children of its own is a "get": its first value is used
  # as the search pattern and the processed matches are stored under the
  # child's name (nothing is stored when there is no match). A child with
  # children is a "section": it gets a nested Hash and is processed
  # recursively, searching inside the nodes matched by its pattern (or the
  # current nodes when it has no pattern).
  #
  # @param noko  node set / document to search in (must respond to +search+)
  # @param node  parser node whose children are processed
  #   (presumably an SDL4R tag - verify against SDL4R::read)
  # @param store [Hash] destination for the extracted values
  # @param level [Integer] nesting depth; only passed along
  def processNode(noko,node,store,level=0)
    node.children.each { |child|
      tag = child.name
      pattern = child.values[0]
      attrs = child.attributes

      if child.children.count == 0
        # no children, so it's a "get"
        values = noko.search(pattern)
        store[tag] = processValues(values, attrs) if values.count > 0
      else
        # it's a "section"
        store[tag] = {}
        subnoko = pattern.nil? ? noko : noko.search(pattern)
        processNode(subnoko, child, store[tag], level + 1)
      end
    }
  end

  # Runs the parser over the downloaded page.
  #
  # @return [Hash] the collected values
  def process
    printProgress(@url,@index,1)
    processNode(@noko, @parser, @store)

    printProgress(@url,@index,2)
    @store
  end

end
|
177
|
+
|
178
|
+
#######################################################
|
179
|
+
#
|
180
|
+
# This is the end;
|
181
|
+
# my only friend, the end...
|
182
|
+
#
|
183
|
+
#######################################################
|
data/lib/bitreaper/helpers.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
#######################################################
|
2
|
+
# BitReaper
|
3
|
+
# Automated Web-Scraping Client for Ruby
|
4
|
+
#
|
5
|
+
# (c)2020 Yanis Zafirópulos
|
6
|
+
# aka Dr.Kameleon
|
7
|
+
#
|
8
|
+
# <yaniszaf@gmail.com>
|
9
|
+
#######################################################
|
10
|
+
# @file lib/bitreaper/helpers.rb
|
11
|
+
#######################################################
|
12
|
+
|
13
|
+
##########################################
|
14
|
+
# HELPER FUNCTIONS
|
15
|
+
##########################################
|
16
|
+
|
17
|
+
class String
  # Shortens a long string to its first and last +edge_length+ characters
  # joined by "...".
  #
  # The string itself is returned when it is shorter than +minimum_length+
  # or when the two edges would already cover it completely.
  #
  # @param minimum_length [Integer] strings shorter than this are left alone
  # @param edge_length    [Integer] number of characters kept at each end
  # @return [String]
  def ellipsisize(minimum_length=15,edge_length=15)
    return self if length < minimum_length || length <= edge_length * 2

    # plain slicing instead of a regex: "." does not match newlines, so the
    # regex version silently left multi-line strings untouched
    "#{self[0, edge_length]}...#{self[length - edge_length, edge_length]}"
  end
end
|
25
|
+
|
26
|
+
# Prints the ASCII-art logo with the copyright line, followed by an empty line.
def printLogo
  art_rows = [
    " ____ _ _ ____",
    " | __ )(_) |_| _ \\ ___ __ _ _ __ ___ _ __",
    " | _ \\| | __| |_) / _ \\/ _` | '_ \\ / _ \\ '__|",
    " | |_) | | |_| _ < __/ (_| | |_) | __/ |",
    " |____/|_|\\__|_| \\_\\___|\\__,_| .__/ \\___|_|"
  ]

  # every art row, including the last one, ends with a newline
  art = art_rows.map { |row| row + "\n" }.join.light_cyan.bold
  credit = " (c) 2020, Dr.Kameleon".cyan
  tail = " |_| ".light_cyan.bold

  puts art + credit + tail
  puts ""
end
|
37
|
+
|
38
|
+
# Prints the program header and an error box to stdout, then terminates the
# process with a failing exit status.
#
# @param msg [String] error text; every line after the first is indented
def showError(msg)
  puts " BitReaper v0.1.0\n".bold +
       " (c)2020 Dr.Kameleon\n"
  puts "-" * 90
  print " ✘ ERROR: ".light_red.bold
  puts msg.split("\n").join("\n ")
  puts "-" * 90
  puts ""
  # a bare `exit` reports success (status 0); an error must not
  exit 1
end
|
48
|
+
|
49
|
+
# Writes an informational line, prefixed with a bullet, to stdout.
#
# @param msg [String] text to show
def showInfo(msg)
  line = " ● " + msg
  $stdout.puts(line)
end
|
52
|
+
|
53
|
+
# Writes a highlighted, bullet-prefixed success line to stdout, followed by
# an empty line.
#
# @param msg [String] text to show
def showSuccess(msg)
  highlighted = (" ● " + msg).light_green.bold
  # puts with two arguments prints each on its own line
  puts highlighted, ""
end
|
57
|
+
|
58
|
+
# Redraws the progress line for one item on the current terminal line.
#
# @param item  [String] label of the item (shortened with ellipsisize)
# @param indx  [Integer] zero-based position; shown as indx+1 of $total
# @param stage [Integer] 0 = downloading, 1 = processing, 2 = done (this
#   one also ends the line)
def printProgress(item,indx,stage)
  status = case stage
           when 0 then "Downloading..."
           when 1 then "Processing... "
           when 2 then "OK ✔︎ ".light_green.bold + "\n"
           end

  # carriage return first, so the new text overwrites the previous stage
  print "\r"

  counter = " ► [#{indx+1}/#{$total}] ".bold
  label = item.ellipsisize.light_magenta.underline
  print counter + label + " ➔ " + status
end
|
72
|
+
|
73
|
+
# Reports the end of processing, writes the collected data to +file+ as
# pretty-printed JSON and prints a success message.
#
# @param file  [String] path of the output file (overwritten)
# @param store data to serialise with JSON.pretty_generate
def saveStoreToFile(file,store)
  puts ""
  [
    "finished processing #{$total} entries",
    "saving to file: " + file
  ].each { |note| showInfo(note) }
  puts "\n"

  File.open(file, "w") do |out|
    out.write(JSON.pretty_generate(store))
  end

  showSuccess("SUCCESS :)")
end
|
85
|
+
|
86
|
+
#######################################################
|
87
|
+
#
|
88
|
+
# This is the end;
|
89
|
+
# my only friend, the end...
|
90
|
+
#
|
91
|
+
#######################################################
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bitreaper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dr.Kameleon
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-04-09 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Automated Web-Scraping Client for Ruby
|
14
|
+
email: yaniszaf@gmail.com
|
15
|
+
executables:
|
16
|
+
- bitreaper
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/bitreaper
|
21
|
+
- lib/bitreaper.rb
|
22
|
+
- lib/bitreaper/helpers.rb
|
23
|
+
homepage: https://rubygems.org/gems/bitreaper
|
24
|
+
licenses:
|
25
|
+
- MIT
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubygems_version: 3.0.4
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: BitReaper
|
46
|
+
test_files: []
|