bitreaper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ef322874a2e0f8e3876e554671d80ff724d6fb598be0c7e25be992c268bbac5a
4
+ data.tar.gz: c6c21e747415b48c0246cf07bb95d4512238d358859699593235365a69c0f752
5
+ SHA512:
6
+ metadata.gz: 3c7fd3310b52989dab3299294944c507ca39cecffd02544ce6c4a9f95a2db2016dee184c9bbdb5848ba5411297daa021abb9ff1c010ee2250976561fd3af5f08
7
+ data.tar.gz: 6d0294a916a199ddf934ad873b7943a2a8106e1c4209b5d2125c26fd5f2fb882d5f8730c24495e0286687b22aeda8329301359b6c11761735c354dabde60a81f
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env ruby
2
+ #######################################################
3
+ # BitReaper
4
+ # Automated Web-Scraping Client for Ruby
5
+ #
6
+ # (c)2020 Yanis Zafirópulos
7
+ # aka Dr.Kameleon
8
+ #
9
+ # <yaniszaf@gmail.com>
10
+ #######################################################
11
+ # @file bin/bitreaper
12
+ #######################################################
13
+
14
+ require 'colorize'
15
+ require 'optparse'
16
+
17
+ require 'bitreaper'
18
+
19
+ ##########################################
20
+ # ENTRY
21
+ ##########################################
22
+
23
# Set defaults

$url        = ""
$inputFile  = ""
$outputFile = "output.json"

$verbose = false

# Parse command-line options

ARGV.options do |opts|
  opts.banner = "BitReaper v0.1.0\n".bold +
                "(c)2020 Dr.Kameleon\n\n" +
                "Usage: bitreaper <parser> [options]\n\n"

  opts.on("-iINPUT", "--input=INPUT", String, "Set input file") { |val| $inputFile = val }
  opts.on("-oOUTPUT", "--output=OUTPUT", String, "Set output file") { |val| $outputFile = val }
  opts.on("-uURL", "--url=URL", String, "Set single url input") { |val| $url = val }
  # A plain on/off flag: it takes no argument, so no value type is declared
  opts.on("-v", "--verbose", "Print debug messages") { $verbose = true }
  opts.on_tail("-h", "--help", "Show this help page") {
    puts opts
    puts ""
    exit
  }
  opts.parse!
end

# Check if everything is correct
# (after opts.parse! only the positional arguments are left in ARGV)

showError("Parser file not provided!\nexpected: bitreaper <parser> [options]") if ARGV.count == 0
showError("Too many arguments provided!\nexpected: bitreaper <parser> [options]") if ARGV.count > 1
showError("Both input file and single url provided!\nexpected: one of the two options") if $url != "" && $inputFile != ""
# Without any source there is nothing to download, so stop before trying
showError("Neither input file nor single url provided!\nexpected: one of the two options") if $url == "" && $inputFile == ""

$parserFile = ARGV[0]
showError("Parser file does not exist!\npath given: #{$parserFile}") unless File.exist? $parserFile

if $inputFile != ""
  showError("Input file does not exist!\npath given: #{$inputFile}") unless File.exist? $inputFile
end

# Show our logo

printLogo()

# Read parser

showInfo("reading parser: #{$parserFile}")
$parser = BitReaper.getParser($parserFile)

if $inputFile != ""

  # Read urls: one per line; surrounding whitespace (including a trailing
  # "\r") is stripped and blank lines are ignored
  showInfo("reading urls: #{$inputFile}")
  $urls = File.read($inputFile).split("\n").map(&:strip).reject(&:empty?)

  # Initial setup
  $total = $urls.count
  puts "\n"

  # Process project: one result per url, in input order
  $store = []
  $urls.each_with_index { |url, i|
    br = BitReaper.new(url, $parser, i)
    $store << br.process()
  }

  # Save store to file
  saveStoreToFile($outputFile, $store)

else

  # Initial setup
  $total = 1
  puts "\n"

  # Process single url
  # (the index is zero-based; printProgress shows it as index+1 of $total)
  br = BitReaper.new($url, $parser, 0)
  $store = br.process()

  # Save store to file
  saveStoreToFile($outputFile, $store)

end
107
+
108
+ #######################################################
109
+ #
110
+ # This is the end;
111
+ # my only friend, the end...
112
+ #
113
+ #######################################################
@@ -0,0 +1,183 @@
1
+ #######################################################
2
+ # BitReaper
3
+ # Automated Web-Scraping Client for Ruby
4
+ #
5
+ # (c)2020 Yanis Zafirópulos
6
+ # aka Dr.Kameleon
7
+ #
8
+ # <yaniszaf@gmail.com>
9
+ #######################################################
10
+ # @file lib/bitreaper.rb
11
+ #######################################################
12
+
13
+ require 'awesome_print'
14
+ require 'colorize'
15
+ require 'json'
16
+ require 'liquid'
17
+ require 'nokogiri'
18
+ require 'open-uri'
19
+ require 'sdl4r'
20
+ require 'watir'
21
+ require 'webdrivers'
22
+
23
+ require_relative 'bitreaper/helpers.rb'
24
+
25
+ ##########################################
26
+ # SUPERGLOBALS
27
+ ##########################################
28
+
29
+ $bitreaper_version = 0.1
30
+
31
+ ##########################################
32
+ # MAIN CLASS
33
+ ##########################################
34
+
35
# Scrapes a single url according to a parser description and collects the
# extracted values into a nested Hash.
class BitReaper
  # Creates a scraper for one url and downloads the page immediately.
  #
  # url    - address of the page to scrape
  # parser - an already loaded parser tree, or a String path to a parser
  #          file (which is then loaded through BitReaper.getParser)
  # i      - position of this url in the batch; only passed on to the
  #          progress output
  def initialize(url,parser,i=0)
    @url    = url
    # getParser is defined on the class, so it has to be reached through it
    @parser = parser.is_a?(String) ? self.class.getParser(parser) : parser

    @index  = i
    @store  = {}

    @noko   = download(@url)
  end

  # Loads a parser file, echoes its (preprocessed) text and parses it.
  #
  # A bare `word!` is shorthand for `word=on`; it is rewritten before parsing.
  # The whitespace following the `!` is kept so neighbouring tokens and lines
  # stay separated.
  #
  # file - path of the parser file
  #
  # Returns whatever SDL4R::read produces for the preprocessed text.
  def self.getParser(file)
    parserFile = File.read(file)
    parserFile = parserFile.gsub(/(\w+)!(?=\s|\z)/, '\1=on')

    puts parserFile.split("\n").map { |l| " " + l }.join("\n").light_black
    puts ""

    SDL4R::read(parserFile)
  end

  # Fetches a url and parses the response as HTML.
  #
  # url          - address to fetch
  # withProgress - when true, prints the "downloading" progress line first
  #
  # Returns the document produced by Nokogiri::HTML.
  def download(url,withProgress=true)
    printProgress(url, @index, 0) if withProgress

    # URI.open (open-uri) only opens URIs; Kernel#open would also accept
    # local paths and "|command" strings
    Nokogiri::HTML(URI.open(url))
  end

  # Applies one attribute to a String value.
  #
  # attrb - attribute name ("prepend", "append", "capitalize", "uppercase",
  #         "lowercase", "trim", "replace", "remove" or "split")
  # val   - the current String value
  # param - the attribute's argument; "replace" reads param[0] (search) and
  #         param[1] (replacement)
  #
  # Returns the transformed value; unknown attributes leave it untouched.
  def processStringValue(attrb,val,param)
    case attrb
    when "prepend"    then param + val
    when "append"     then val + param
    when "capitalize" then val.capitalize
    when "uppercase"  then val.upcase
    when "lowercase"  then val.downcase
    when "trim"       then val.strip
    when "replace"    then val.gsub(param[0], param[1])
    when "remove"     then val.gsub(param, "")
    when "split"      then val.split(param)
    else val
    end
  end

  # Applies one attribute to an Array value.
  #
  # attrb - attribute name ("join", "first", "last", "index",
  #         "select.include" or "select.match")
  # val   - the current Array of Strings
  # param - the attribute's argument; for the select.* attributes a param
  #         starting with "/" is treated as a regular expression (every "/"
  #         is removed from it), anything else as a plain string
  #
  # Returns the transformed value; unknown attributes leave it untouched.
  def processArrayValue(attrb,val,param)
    case attrb
    when "join"  then val.join(param)
    when "first" then val.first
    when "last"  then val.last
    when "index" then val[param.to_i]
    when "select.include"
      if param.start_with? "/"
        pattern = Regexp.new(param.delete('/'))
        val.select { |r| r =~ pattern }
      else
        val.select { |r| r.include? param }
      end
    when "select.match"
      if param.start_with? "/"
        # Single quotes keep \A and \Z as regexp anchors; the group makes the
        # anchors apply to the whole pattern, alternations included
        pattern = Regexp.new('\A(?:' + param.delete('/') + ')\Z')
        val.select { |r| r =~ pattern }
      else
        val.select { |r| r == param }
      end
    else
      val
    end
  end

  # Turns matched nodes into a final value by applying the attributes in order.
  #
  # values - the matched nodes (each responding to #content)
  # attrbs - attribute name => argument pairs
  #
  # String arguments are rendered as Liquid templates against the values
  # collected so far; "a||b" is split into several rendered params.
  #
  # Returns a String for a single match, otherwise an Array (or whatever the
  # attributes reduce it to); "" when the result ends up nil.
  def processValues(values,attrbs)
    # check if we have a single value or an array of values
    ret = (values.count == 1) ? values[0].content : values.map { |v| v.content }

    # no attributes, just return it
    return ret if attrbs.size == 0

    attrbs.each { |attrb, arg|
      # nothing left to transform once an attribute produced nil
      break if ret.nil?

      param = nil
      if arg.is_a? String
        # get params if we have multiple params; or not
        param = if arg.include? "||"
                  arg.split("||").map { |a| Liquid::Template.parse(a).render(@store) }
                else
                  Liquid::Template.parse(arg).render(@store)
                end
      end

      ret = if ret.is_a? String
              # if our value is a String, process it accordingly
              processStringValue(attrb, ret, param)
            else
              # it's an array of values, so look for array-operating attributes
              processArrayValue(attrb, ret, param)
            end
    }

    ret.nil? ? "" : ret
  end

  # Walks the parser tree and fills `store` with the extracted values.
  #
  # noko  - document (or node set) to search in
  # node  - parser node whose children are processed
  # store - Hash receiving the results, keyed by the child's name
  # level - nesting depth, incremented for every nested section
  def processNode(noko,node,store,level=0)
    node.children.each { |child|
      tag     = child.name
      pattern = child.values[0]
      attrs   = child.attributes

      if child.children.count == 0
        # no children, so it's a "get"
        values = noko.search(pattern)

        # a tag without any match is left out of the store
        store[tag] = processValues(values, attrs) if values.count > 0
      else
        # it's a "section": a section without a pattern searches the same
        # scope as its parent
        store[tag] = {}
        subnoko = pattern.nil? ? noko : noko.search(pattern)
        processNode(subnoko, child, store[tag], level + 1)
      end
    }
  end

  # Runs the parser against the downloaded page.
  #
  # Returns the Hash of extracted values.
  def process
    printProgress(@url, @index, 1)
    processNode(@noko, @parser, @store)

    printProgress(@url, @index, 2)
    @store
  end

end
177
+
178
+ #######################################################
179
+ #
180
+ # This is the end;
181
+ # my only friend, the end...
182
+ #
183
+ #######################################################
@@ -0,0 +1,91 @@
1
+ #######################################################
2
+ # BitReaper
3
+ # Automated Web-Scraping Client for Ruby
4
+ #
5
+ # (c)2020 Yanis Zafirópulos
6
+ # aka Dr.Kameleon
7
+ #
8
+ # <yaniszaf@gmail.com>
9
+ #######################################################
10
+ # @file lib/bitreaper/helpers.rb
11
+ #######################################################
12
+
13
+ ##########################################
14
+ # HELPER FUNCTIONS
15
+ ##########################################
16
+
17
class String
  # Shortens a long string by replacing its middle part with "...".
  #
  # minimum_length - strings shorter than this are returned untouched
  # edge_length    - number of characters kept at each end
  #
  # Returns self when the string is too short to be worth shortening,
  # otherwise the result of the substitution.
  def ellipsisize(minimum_length=15,edge_length=15)
    too_short = length < minimum_length || length <= edge_length * 2
    return self if too_short

    edge_pattern = '.' * edge_length
    middle_min   = length - edge_length * 2
    shape        = /(#{edge_pattern}).{#{middle_min},}(#{edge_pattern})/

    gsub(shape, '\1...\2')
  end
end
25
+
26
# Prints the coloured ASCII-art logo with the copyright line, followed by an
# empty line.
def printLogo
  art_rows = [
    " ____ _ _ ____\n",
    " | __ )(_) |_| _ \\ ___ __ _ _ __ ___ _ __\n",
    " | _ \\| | __| |_) / _ \\/ _` | '_ \\ / _ \\ '__|\n",
    " | |_) | | |_| _ < __/ (_| | |_) | __/ |\n",
    " |____/|_|\\__|_| \\_\\___|\\__,_| .__/ \\___|_|\n"
  ]

  art    = art_rows.join.light_cyan.bold
  credit = " (c) 2020, Dr.Kameleon".cyan
  tail   = " |_| ".light_cyan.bold

  puts art + credit + tail, ""
end
37
+
38
# Prints the program banner and a framed error message, then terminates the
# process with a failure status.
#
# msg - error text; continuation lines are indented by one space
def showError(msg)
  puts " BitReaper v0.1.0\n".bold +
       " (c)2020 Dr.Kameleon\n"
  puts "-" * 90
  print " ✘ ERROR: ".light_red.bold
  puts msg.split("\n").join("\n ")
  puts "-" * 90
  puts ""
  # non-zero status so shells and scripts can tell the run failed
  exit 1
end
48
+
49
# Prints an informational message as a bullet line.
#
# msg - text to show after the bullet
def showInfo(msg)
  line = " ● " + msg
  puts(line)
end
52
+
53
# Prints a highlighted success message as a bullet line, followed by an
# empty line.
#
# msg - text to show after the bullet
def showSuccess(msg)
  highlighted = (" ● " + msg).light_green.bold
  puts highlighted, ""
end
57
+
58
# Rewrites the current terminal line with the progress of one item.
#
# item  - label to show (shortened with String#ellipsisize)
# indx  - zero-based position of the item; shown as indx+1 out of $total
# stage - 0 (downloading), 1 (processing) or 2 (done; ends the line)
def printProgress(item,indx,stage)
  status = case stage
           when 0 then "Downloading..."
           when 1 then "Processing... "
           when 2 then "OK ✔︎ ".light_green.bold + "\n"
           end

  # carriage return first, so the new text overwrites the previous stage
  print "\r"

  counter = " ► [#{indx+1}/#{$total}] ".bold
  label   = item.ellipsisize.light_magenta.underline
  print counter + label + " ➔ " + status
end
72
+
73
# Writes the collected results to a file as pretty-printed JSON and reports
# what was done.
#
# file  - path of the output file (overwritten if it exists)
# store - the data to serialise
def saveStoreToFile(file,store)
  puts ""
  showInfo("finished processing #{$total} entries")
  showInfo("saving to file: " + file)
  puts "\n"

  File.open(file, "w") do |out|
    out.write(JSON.pretty_generate(store))
  end

  showSuccess("SUCCESS :)")
end
85
+
86
+ #######################################################
87
+ #
88
+ # This is the end;
89
+ # my only friend, the end...
90
+ #
91
+ #######################################################
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bitreaper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Dr.Kameleon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-04-09 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Automated Web-Scraping Client for Ruby
14
+ email: yaniszaf@gmail.com
15
+ executables:
16
+ - bitreaper
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/bitreaper
21
+ - lib/bitreaper.rb
22
+ - lib/bitreaper/helpers.rb
23
+ homepage: https://rubygems.org/gems/bitreaper
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubygems_version: 3.0.4
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: BitReaper
46
+ test_files: []