bitreaper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ef322874a2e0f8e3876e554671d80ff724d6fb598be0c7e25be992c268bbac5a
4
+ data.tar.gz: c6c21e747415b48c0246cf07bb95d4512238d358859699593235365a69c0f752
5
+ SHA512:
6
+ metadata.gz: 3c7fd3310b52989dab3299294944c507ca39cecffd02544ce6c4a9f95a2db2016dee184c9bbdb5848ba5411297daa021abb9ff1c010ee2250976561fd3af5f08
7
+ data.tar.gz: 6d0294a916a199ddf934ad873b7943a2a8106e1c4209b5d2125c26fd5f2fb882d5f8730c24495e0286687b22aeda8329301359b6c11761735c354dabde60a81f
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env ruby
2
+ #######################################################
3
+ # BitReaper
4
+ # Automated Web-Scraping Client for Ruby
5
+ #
6
+ # (c)2020 Yanis Zafirópulos
7
+ # aka Dr.Kameleon
8
+ #
9
+ # <yaniszaf@gmail.com>
10
+ #######################################################
11
+ # @file bin/bitreaper
12
+ #######################################################
13
+
14
+ require 'colorize'
15
+ require 'optparse'
16
+
17
+ require 'bitreaper'
18
+
19
+ ##########################################
20
+ # ENTRY
21
+ ##########################################
22
+
23
# Set defaults
#
# These globals are read further down the script:
#   $url        - single url to scrape (empty when not given)
#   $inputFile  - path of a file listing urls (empty when not given)
#   $outputFile - path the resulting JSON is written to
#   $verbose    - set by -v/--verbose

$url = ""
$inputFile = ""
$outputFile = "output.json"

$verbose = false

# Parse command-line options
#
# After parsing, ARGV only holds the positional arguments
# (the parser file), since parse! removes every recognised option.

ARGV.options do |opts|
  opts.banner = "BitReaper v0.1.0\n".bold +
                "(c)2020 Dr.Kameleon\n\n"+
                "Usage: bitreaper <parser> [options]\n\n"

  opts.on("-iINPUT", "--input=INPUT", String, "Set input file") {|val| $inputFile = val }
  opts.on("-oOUTPUT", "--output=OUTPUT", String, "Set output file") {|val| $outputFile = val }
  opts.on("-uURL", "--url=URL", String, "Set single url input") {|val| $url = val }
  # plain flag: it takes no argument, so no argument type is declared
  opts.on("-v", "--verbose", "Print debug messages") { $verbose = true }
  opts.on_tail("-h", "--help", "Show this help page") {
    puts opts
    puts ""
    exit
  }

  begin
    opts.parse!
  rescue OptionParser::ParseError => e
    # unknown option / missing argument: report it like every other usage error
    showError("Invalid options!\n#{e.message}")
  end
end
49
+
50
# Check if everything is correct

usageHint = "expected: bitreaper <parser> [options]"

showError("Parser file not provided!\n#{usageHint}") if ARGV.empty?
showError("Too many arguments provided!\n#{usageHint}") if ARGV.count > 1
showError("Both input file and single url provided!\nexpected: one of the two options") if !$url.empty? && !$inputFile.empty?
# without this check the script would go on and try to download an empty url
showError("Neither input file nor single url provided!\nexpected: one of the two options") if $url.empty? && $inputFile.empty?

$parserFile = ARGV[0]
showError("Parser file does not exist!\npath given: #{$parserFile}") unless File.exist? $parserFile

unless $inputFile.empty?
  showError("Input file does not exist!\npath given: #{$inputFile}") unless File.exist? $inputFile
end

# Show our logo

printLogo()

# Read parser

showInfo("reading parser: #{$parserFile}")
$parser = BitReaper.getParser($parserFile)

if !$inputFile.empty?

  # Read urls: one per line, ignoring blank lines and
  # surrounding whitespace (e.g. "\r" from CRLF files)
  showInfo("reading urls: #{$inputFile}")
  $urls = File.readlines($inputFile).map(&:strip).reject(&:empty?)

  # Initial setup ($total is read by the progress/summary helpers)
  $total = $urls.count
  puts "\n"

  # Process project: one result per url, in input order
  $store = $urls.each_with_index.map{|url,i|
    BitReaper.new(url,$parser,i).process()
  }

  # Save store to file
  saveStoreToFile($outputFile,$store)

else

  # Initial setup
  $total = 1
  puts "\n"

  # Process single url
  # (indices are zero-based: the progress line prints index+1 out of $total)
  br = BitReaper.new($url,$parser,0)
  $store = br.process()

  # Save store to file
  saveStoreToFile($outputFile,$store)

end
107
+
108
+ #######################################################
109
+ #
110
+ # This is the end;
111
+ # my only friend, the end...
112
+ #
113
+ #######################################################
@@ -0,0 +1,183 @@
1
+ #######################################################
2
+ # BitReaper
3
+ # Automated Web-Scraping Client for Ruby
4
+ #
5
+ # (c)2020 Yanis Zafirópulos
6
+ # aka Dr.Kameleon
7
+ #
8
+ # <yaniszaf@gmail.com>
9
+ #######################################################
10
+ # @file lib/bitreaper.rb
11
+ #######################################################
12
+
13
+ require 'awesome_print'
14
+ require 'colorize'
15
+ require 'json'
16
+ require 'liquid'
17
+ require 'nokogiri'
18
+ require 'open-uri'
19
+ require 'sdl4r'
20
+ require 'watir'
21
+ require 'webdrivers'
22
+
23
+ require_relative 'bitreaper/helpers.rb'
24
+
25
+ ##########################################
26
+ # SUPERGLOBALS
27
+ ##########################################
28
+
29
+ $bitreaper_version = 0.1
30
+
31
+ ##########################################
32
+ # MAIN CLASS
33
+ ##########################################
34
+
35
# Scrapes one url: downloads the page, walks a parser definition
# (as returned by SDL4R::read) and collects the extracted values in a Hash.
class BitReaper
  # Sets up the scraper and downloads the page right away.
  #
  # url    - address of the page to scrape
  # parser - path of a parser file (String) or an already loaded parser
  # i      - zero-based position of the url in the batch (used for progress output)
  def initialize(url,parser,i=0)
    @url = url
    # getParser is defined on the class, so it has to be called through it
    @parser = (parser.is_a? String) ? self.class.getParser(parser) : parser

    @index = i
    @store = {}

    @noko = self.download(@url)
  end

  # Reads a parser file, echoes its (expanded) content and parses it with SDL4R.
  #
  # file - path of the parser file
  #
  # Returns whatever SDL4R::read returns for the expanded text.
  def self.getParser(file)
    parserFile = File.read(file)
    # expand the "flag!" shorthand to "flag=on"; the lookahead keeps the
    # whitespace that follows, so neighbouring attributes/lines stay separated
    parserFile = parserFile.gsub(/(\w+)!(?=\s|\z)/,'\1=on')

    puts parserFile.split("\n").map{|l| " "+l}.join("\n").light_black
    puts ""

    return SDL4R::read(parserFile)
  end

  # Downloads a page and parses it as HTML.
  #
  # url          - address to fetch
  # withProgress - when true, prints the "downloading" progress line first
  #
  # Returns the Nokogiri HTML document.
  def download(url,withProgress=true)
    printProgress(@url,@index,0) if withProgress

    # URI.open only handles urls; Kernel#open would also run "|command" strings
    return Nokogiri::HTML(URI.open(url))
  end

  # Applies one attribute to a String value.
  #
  # attrb - attribute name (prepend, append, capitalize, uppercase, lowercase,
  #         trim, replace, remove, split)
  # val   - the current String value
  # param - attribute parameter; "replace" reads param[0] and param[1]
  #
  # Returns the transformed value; unknown attributes leave it untouched.
  def processStringValue(attrb,val,param)
    case attrb
    when "prepend"    then param + val
    when "append"     then val + param
    when "capitalize" then val.capitalize
    when "uppercase"  then val.upcase
    when "lowercase"  then val.downcase
    when "trim"       then val.strip
    when "replace"    then val.gsub(param[0], param[1])
    when "remove"     then val.gsub(param,"")
    when "split"      then val.split(param)
    else val
    end
  end

  # Applies one attribute to an Array of values.
  #
  # attrb - attribute name (join, first, last, index, select.include, select.match)
  # val   - the current Array
  # param - attribute parameter; for the select.* attributes a parameter
  #         starting with "/" is treated as a regular expression
  #
  # Returns the transformed value; unknown attributes leave it untouched.
  def processArrayValue(attrb,val,param)
    case attrb
    when "join"
      val.join(param)
    when "first"
      val.first
    when "last"
      val.last
    when "index"
      val[param.to_i]
    when "select.include"
      if param.start_with? "/"
        regex = Regexp.new(param.tr('/', ''))
        val.select{|r| r =~ regex }
      else
        val.select{|r| r.include? param }
      end
    when "select.match"
      if param.start_with? "/"
        # the backslashes are doubled so the anchors survive the double-quoted
        # string; the group keeps alternations ("a|b") inside the anchors
        regex = Regexp.new("\\A(?:#{param.tr('/','')})\\Z")
        val.select{|r| r =~ regex }
      else
        val.select{|r| r==param }
      end
    else
      val
    end
  end

  # Turns the matched nodes into a value and runs every attribute over it, in order.
  #
  # values - non-empty collection of matched nodes (each responding to #content)
  # attrbs - attribute name => argument pairs
  #
  # Returns a String for a single match, an Array otherwise (or whatever the
  # attributes turned it into); nil results become "".
  def processValues(values,attrbs)
    # check if we have a single value or an array of values
    ret = (values.count==1) ? values[0].content : values.map{|v| v.content}

    # no attributes, just return it
    return ret if attrbs.size==0

    attrbs.each{|attrb,arg|
      param = resolveParam(arg)

      if ret.is_a? String
        # if our value is a String, process it accordingly
        ret = self.processStringValue(attrb,ret,param)
      else
        # it's an array of values, so look for array-operating attributes
        ret = self.processArrayValue(attrb,ret,param)
      end
    }

    return (ret.nil?) ? "" : ret
  end

  # Walks the children of a parser node and fills the store.
  #
  # noko  - the document (or node set) to search in
  # node  - parser node whose children are processed
  # store - Hash receiving the results, keyed by child name
  # level - nesting depth, incremented on every recursion
  def processNode(noko,node,store,level=0)
    node.children.each{|child|
      tag = child.name
      pattern = child.values[0]
      attrs = child.attributes

      if child.children.count==0
        # no children, so it's a "get"
        values = noko.search(pattern)
        store[tag] = self.processValues(values, attrs) if values.count>0
      else
        # it's a "section": results go into a nested Hash; without a
        # pattern the section keeps searching in the current scope
        store[tag] = {}
        subnoko = pattern.nil? ? noko : noko.search(pattern)
        processNode(subnoko,child,store[tag],level+1)
      end
    }
  end

  # Runs the parser over the downloaded page, printing progress around it.
  #
  # Returns the filled store Hash.
  def process
    printProgress(@url,@index,1)
    processNode(@noko, @parser, @store)

    printProgress(@url,@index,2)
    return @store
  end

  private

  # Prepares an attribute argument for the value processors.
  # String arguments are rendered as Liquid templates against the store
  # ("||" separates multiple parameters, yielding an Array); any other
  # value is passed through unchanged so e.g. numeric arguments are kept.
  def resolveParam(arg)
    return arg unless arg.is_a? String

    if arg.include? "||"
      arg.split("||").map{|a| Liquid::Template.parse(a).render(@store) }
    else
      Liquid::Template.parse(arg).render(@store)
    end
  end

end
177
+
178
+ #######################################################
179
+ #
180
+ # This is the end;
181
+ # my only friend, the end...
182
+ #
183
+ #######################################################
@@ -0,0 +1,91 @@
1
+ #######################################################
2
+ # BitReaper
3
+ # Automated Web-Scraping Client for Ruby
4
+ #
5
+ # (c)2020 Yanis Zafirópulos
6
+ # aka Dr.Kameleon
7
+ #
8
+ # <yaniszaf@gmail.com>
9
+ #######################################################
10
+ # @file lib/bitreaper/helpers.rb
11
+ #######################################################
12
+
13
+ ##########################################
14
+ # HELPER FUNCTIONS
15
+ ##########################################
16
+
17
+ class String
18
+ def ellipsisize(minimum_length=15,edge_length=15)
19
+ return self if self.length < minimum_length or self.length <= edge_length*2
20
+ edge = '.'*edge_length
21
+ mid_length = self.length - edge_length*2
22
+ gsub(/(#{edge}).{#{mid_length},}(#{edge})/, '\1...\2')
23
+ end
24
+ end
25
+
26
# Prints the BitReaper ASCII-art logo followed by the copyright line
# and an empty line.
def printLogo
  art = [
    " ____ _ _ ____\n",
    " | __ )(_) |_| _ \\ ___ __ _ _ __ ___ _ __\n",
    " | _ \\| | __| |_) / _ \\/ _` | '_ \\ / _ \\ '__|\n",
    " | |_) | | |_| _ < __/ (_| | |_) | __/ |\n",
    " |____/|_|\\__|_| \\_\\___|\\__,_| .__/ \\___|_|\n"
  ].join

  credits = " (c) 2020, Dr.Kameleon".cyan
  # last piece of the logo, printed on the same line as the credits
  tail = " |_| ".light_cyan.bold

  puts art.light_cyan.bold + credits + tail
  puts ""
end
37
+
38
# Prints the program header and an error message, then terminates the process.
#
# msg - error text; may span several lines (separated by "\n")
#
# Never returns: exits with status 1 so callers of the executable
# (shells, scripts) can tell the run failed.
def showError(msg)
  puts " BitReaper v0.1.0\n".bold +
       " (c)2020 Dr.Kameleon\n"
  puts "-" * 90
  print " ✘ ERROR: ".light_red.bold
  # continuation lines get a leading space
  puts msg.split("\n").join("\n ")
  puts "-" * 90
  puts ""
  exit 1
end
48
+
49
# Prints an informational message prefixed with a bullet.
#
# msg - text to print (String)
def showInfo(msg)
  line = " ● " + msg
  puts line
end
52
+
53
# Prints a success message (bullet-prefixed, bold light green)
# followed by an empty line.
#
# msg - text to print (String)
def showSuccess(msg)
  banner = " ● " + msg
  puts banner.light_green.bold
  puts ""
end
57
+
58
# Rewrites the current terminal line with the progress of one item.
#
# item  - label of the item (shortened with String#ellipsisize)
# indx  - zero-based index of the item; shown as indx+1 out of $total
# stage - 0 (downloading), 1 (processing) or 2 (done, ends the line)
def printProgress(item,indx,stage)
  status = case stage
           when 0 then "Downloading..."
           when 1 then "Processing... "
           when 2 then "OK ✔︎ ".light_green.bold + "\n"
           end

  counter = " ► [#{indx+1}/#{$total}] ".bold
  label = item.ellipsisize.light_magenta.underline

  # carriage return: jump back to the start of the line before printing
  print "\r"
  print counter + label + " ➔ " + status
end
72
+
73
# Writes the collected results to a file as pretty-printed JSON,
# reporting what is being done before and after.
#
# file  - path of the output file (overwritten)
# store - the data to serialise
def saveStoreToFile(file,store)
  puts ""
  showInfo("finished processing #{$total} entries")
  showInfo("saving to file: " + file)
  puts "\n"

  File.open(file,"w") do |out|
    out.write(JSON.pretty_generate(store))
  end

  showSuccess("SUCCESS :)")
end
85
+
86
+ #######################################################
87
+ #
88
+ # This is the end;
89
+ # my only friend, the end...
90
+ #
91
+ #######################################################
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bitreaper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Dr.Kameleon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-04-09 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Automated Web-Scraping Client for Ruby
14
+ email: yaniszaf@gmail.com
15
+ executables:
16
+ - bitreaper
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - bin/bitreaper
21
+ - lib/bitreaper.rb
22
+ - lib/bitreaper/helpers.rb
23
+ homepage: https://rubygems.org/gems/bitreaper
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubygems_version: 3.0.4
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: BitReaper
46
+ test_files: []