bitreaper 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ef322874a2e0f8e3876e554671d80ff724d6fb598be0c7e25be992c268bbac5a
4
- data.tar.gz: c6c21e747415b48c0246cf07bb95d4512238d358859699593235365a69c0f752
3
+ metadata.gz: 806186133e8f475e8040fc7bee4e676d49665f83f16bcb127c171b7239e0aa94
4
+ data.tar.gz: 693e9fbe65d0b4e697c9cf7eaa871ed905b35509b8a0ff7dd0953933c8b77635
5
5
  SHA512:
6
- metadata.gz: 3c7fd3310b52989dab3299294944c507ca39cecffd02544ce6c4a9f95a2db2016dee184c9bbdb5848ba5411297daa021abb9ff1c010ee2250976561fd3af5f08
7
- data.tar.gz: 6d0294a916a199ddf934ad873b7943a2a8106e1c4209b5d2125c26fd5f2fb882d5f8730c24495e0286687b22aeda8329301359b6c11761735c354dabde60a81f
6
+ metadata.gz: 3080d190593d846cb6cc225c9e775e3899725a10e8785b7d506ba931d12203bdf76f7b0fe533f09b6b074761c1df9f0d46a80c513849377b1ec0cc8b49fdf4f6
7
+ data.tar.gz: 84a2212ce1ab9c9ccd3a43cec0bcff53d3b6ff814d56bc833488668193b7e8ffc80b18732a07b0c5409fa392e645c8c7de5a58f2f052da3bcee772adffe26dbe
@@ -13,6 +13,8 @@
13
13
 
14
14
  require 'colorize'
15
15
  require 'optparse'
16
+ require 'parallel'
17
+ require 'ruby-progressbar'
16
18
 
17
19
  require 'bitreaper'
18
20
 
@@ -24,8 +26,9 @@ require 'bitreaper'
24
26
 
25
27
  $url = ""
26
28
  $inputFile = ""
27
- $outputFile = "output.json"
29
+ $outputDest = "output"
28
30
 
31
+ $parallel = false
29
32
  $verbose = false
30
33
 
31
34
  # Parse command-line options
@@ -36,9 +39,10 @@ ARGV.options do |opts|
36
39
  "Usage: bitreaper <parser> [options]\n\n"
37
40
 
38
41
  opts.on("-iINPUT", "--input==INPUT",String,"Set input file") {|val| $inputFile = val }
39
- opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output file") {|val| $outputFile = val }
42
+ opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output name") {|val| $outputDest = val }
40
43
  opts.on("-uURL", "--url=URL",String,"Set single url input") {|val| $url = val }
41
- opts.on("-v", "--verbose", String, "Print debug messages") {|val| $verbose = true }
44
+ opts.on("-p", "--parallel", "Perform multi-threaded processing"){|val| $parallel = true }
45
+ opts.on("-v", "--verbose", "Print debug messages") {|val| $verbose = true }
42
46
  opts.on_tail("-h", "--help","Show this help page") {
43
47
  puts opts
44
48
  puts ""
@@ -82,13 +86,27 @@ if $inputFile!=""
82
86
 
83
87
  # Process project
84
88
  $store = []
85
- $urls.each_with_index{|url,i|
86
- br = BitReaper.new(url,$parser,i)
87
- $store << br.process()
88
- }
89
+ if not $parallel
90
+ $urls.each_with_index{|url,i|
91
+ br = BitReaper.new(url,$parser,i)
92
+ $store << br.process()
93
+ }
94
+ else
95
+ if not $verbose
96
+ Parallel.each_with_index($urls, in_threads: 4, progress: " ► Processing... "){|url,i|
97
+ br = BitReaper.new(url,$parser,i)
98
+ $store << br.process()
99
+ }
100
+ else
101
+ Parallel.each_with_index($urls, in_threads: 4){|url,i|
102
+ br = BitReaper.new(url,$parser,i)
103
+ $store << br.process()
104
+ }
105
+ end
106
+ end
89
107
 
90
108
  # Save store to file
91
- saveStoreToFile($outputFile,$store)
109
+ saveStoreToFile("#{$outputDest}.json",$store)
92
110
 
93
111
  else
94
112
 
@@ -101,7 +119,7 @@ else
101
119
  $store = br.process()
102
120
 
103
121
  # Save store to file
104
- saveStoreToFile($outputFile,$store)
122
+ saveStoreToFile("#{$outputDest}.json",$store)
105
123
 
106
124
  end
107
125
 
@@ -26,13 +26,22 @@ require_relative 'bitreaper/helpers.rb'
26
26
  # SUPERGLOBALS
27
27
  ##########################################
28
28
 
29
- $bitreaper_version = 0.1
29
+ $bitreaper_version = 0.1.2
30
30
 
31
- ##########################################
32
- # MAIN CLASS
33
- ##########################################
31
+ ####################################################################################
32
+ # **MAIN CLASS**
33
+ # This is the main Web Scraper object. It is through a `BitReaper` instance
34
+ # that you can start scraping
35
+ ####################################################################################
34
36
 
35
37
  class BitReaper
38
+
39
+ # Create a new BitReaper instance
40
+ #
41
+ # @param [String] url The URL of the page to be scraped
42
+ # @param [String,SDL4R::Tag] parser The parser
43
+ # @param [Integer] i Index of the current operation (for reporting purposes)
44
+ #---------------------------------------------------------------------------
36
45
  def initialize(url,parser,i=0)
37
46
  @url = url
38
47
  @parser = (parser.is_a? String) ? self.getParser(parser) : parser
@@ -43,10 +52,16 @@ class BitReaper
43
52
  @noko = self.download(@url)
44
53
  end
45
54
 
55
+ # Get a new parser from a given parser path
56
+ #
57
+ # @param [String] file The path of the `.br` parser file
58
+ #
59
+ # @return [SDL4R::Tag] The resulting parser
60
+ #---------------------------------------------------------------------------
46
61
  def self.getParser(file)
47
62
  parserFile = File.read(file)
48
63
  parserFile = parserFile.gsub(/([\w]+)\!\s/,'\1=on')
49
- if true
64
+ if $verbose
50
65
  puts parserFile.split("\n").map{|l| " "+l}.join("\n").light_black
51
66
  puts ""
52
67
  end
@@ -54,12 +69,38 @@ class BitReaper
54
69
  return SDL4R::read(parserFile)
55
70
  end
56
71
 
72
+ # Process current project
73
+ #---------------------------------------------------------------------------
74
+ def process
75
+ printProgress(@url,@index,1)
76
+ processNode(@noko, @parser, @store)
77
+
78
+ printProgress(@url,@index,2)
79
+ return @store
80
+ end
81
+
82
+ private
83
+
84
+ # Download given URL
85
+ #
86
+ # @param [String] url The URL to be downloaded
87
+ #
88
+ # @return [Nokogiri::HTML::Document] The parsed HTML document
89
+ #---------------------------------------------------------------------------
57
90
  def download(url,withProgress=true)
58
91
  printProgress(@url,@index,0) if withProgress
59
92
 
60
93
  return Nokogiri::HTML(open(url))
61
94
  end
62
95
 
96
+ # Process String value using attribute
97
+ #
98
+ # @param [String] attrb The attribute to be processed
99
+ # @param [String] val The value to be processed
100
+ # @param [String] param The attribute's param (if any)
101
+ #
102
+ # @return [String,Array] The result of the operation
103
+ #---------------------------------------------------------------------------
63
104
  def processStringValue(attrb,val,param)
64
105
  case attrb
65
106
  when "prepend"
@@ -80,10 +121,21 @@ class BitReaper
80
121
  val = val.gsub(param,"")
81
122
  when "split"
82
123
  val = val.split(param)
124
+ when "download"
125
+ val = val
126
+ val.downloadAs($outputDest,(param.is_a? String) ? param : nil)
83
127
  end
84
128
  return val
85
129
  end
86
130
 
131
+ # Process Array value using attribute
132
+ #
133
+ # @param [String] attrb The attribute to be processed
134
+ # @param [Array] val The value to be processed
135
+ # @param [String] param The attribute's param (if any)
136
+ #
137
+ # @return [String,Array] The result of the operation
138
+ #---------------------------------------------------------------------------
87
139
  def processArrayValue(attrb,val,param)
88
140
  case attrb
89
141
  when "join"
@@ -110,6 +162,13 @@ class BitReaper
110
162
  return val
111
163
  end
112
164
 
165
+ # Process parsed values using set of attributes
166
+ #
167
+ # @param [Array] values The parsed values
168
+ # @param [Array] attrbs The associated attributes
169
+ #
170
+ # @return [String,Array] The result of the operation
171
+ #---------------------------------------------------------------------------
113
172
  def processValues(values,attrbs)
114
173
  # check if we have a single value or an array of values
115
174
  ret = (values.count==1) ? values[0].content
@@ -138,6 +197,13 @@ class BitReaper
138
197
  return (ret.nil?) ? "" : ret
139
198
  end
140
199
 
200
+ # Process a given node using provided parser and temporary storage hash
201
+ #
202
+ # @param [Nokogiri::XML::Node] noko The Nokogiri node to work on
203
+ # @param [SDL4R::Tag] node The parser node
204
+ # @param [Hash] store The temporary storage hash
205
+ # @param [Integer] level The nesting level (for informational purposes)
206
+ #---------------------------------------------------------------------------
141
207
  def processNode(noko,node,store,level=0)
142
208
  node.children.each{|child|
143
209
  command = child.namespace
@@ -165,14 +231,6 @@ class BitReaper
165
231
  }
166
232
  end
167
233
 
168
- def process
169
- printProgress(@url,@index,1)
170
- processNode(@noko, @parser, @store)
171
-
172
- printProgress(@url,@index,2)
173
- return @store
174
- end
175
-
176
234
  end
177
235
 
178
236
  #######################################################
@@ -10,10 +10,15 @@
10
10
  # @file lib/bitreaper/helpers.rb
11
11
  #######################################################
12
12
 
13
+ require 'down'
14
+ require 'fileutils'
15
+
13
16
  ##########################################
14
17
  # HELPER FUNCTIONS
15
18
  ##########################################
16
19
 
20
+ ## Misc
21
+
17
22
  class String
18
23
  def ellipsisize(minimum_length=15,edge_length=15)
19
24
  return self if self.length < minimum_length or self.length <= edge_length*2
@@ -21,8 +26,30 @@ class String
21
26
  mid_length = self.length - edge_length*2
22
27
  gsub(/(#{edge}).{#{mid_length},}(#{edge})/, '\1...\2')
23
28
  end
29
+
30
+ def downloadAs(dest,filename=nil)
31
+ # make sure 'dest' path and any dir included in the filename path exist
32
+ FileUtils.mkdir_p dest unless Dir.exist? dest
33
+ if not filename.nil?
34
+ subdir = File.join(dest, File.dirname(filename))
35
+ FileUtils.mkdir_p subdir unless Dir.exist? subdir
36
+ end
37
+
38
+ # download it
39
+ tmpfile = Down.download(self)
40
+
41
+ # in case a filename is specified, save it like that
42
+ # otherwise, try using the original filaname
43
+ if not filename.nil?
44
+ FileUtils.mv(tmpfile.path, "#{dest}/#{filename}")
45
+ else
46
+ FileUtils.mv(tmpfile.path, "#{dest}/#{tmpfile.original_filename}")
47
+ end
48
+ end
24
49
  end
25
50
 
51
+ ## Core
52
+
26
53
  def printLogo
27
54
  puts (" ____ _ _ ____\n" +
28
55
  " | __ )(_) |_| _ \\ ___ __ _ _ __ ___ _ __\n" +
@@ -66,8 +93,15 @@ def printProgress(item,indx,stage)
66
93
  msg += "\n"
67
94
  end
68
95
 
69
- print "\r"
70
- print " ► [#{indx+1}/#{$total}] ".bold + item.ellipsisize.light_magenta.underline + " ➔ " + msg
96
+ toPrint = ("" + "[#{indx+1}/#{$total}] ".ljust(12)).bold + item.ellipsisize.light_magenta.underline + " ➔ " + msg
97
+ if $parallel
98
+ if $verbose
99
+ puts toPrint
100
+ end
101
+ else
102
+ print "\r"
103
+ print toPrint
104
+ end
71
105
  end
72
106
 
73
107
  def saveStoreToFile(file,store)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bitreaper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dr.Kameleon
@@ -10,8 +10,9 @@ bindir: bin
10
10
  cert_chain: []
11
11
  date: 2020-04-09 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Automated Web-Scraping Client for Ruby
14
- email: yaniszaf@gmail.com
13
+ description: Automated Web-Scraping Client for Ruby using SLD2-like configuration
14
+ files. Supports XPath and CSS selectors via Nokogiri.
15
+ email: drkameleon@gmail.com
15
16
  executables:
16
17
  - bitreaper
17
18
  extensions: []
@@ -20,7 +21,7 @@ files:
20
21
  - bin/bitreaper
21
22
  - lib/bitreaper.rb
22
23
  - lib/bitreaper/helpers.rb
23
- homepage: https://rubygems.org/gems/bitreaper
24
+ homepage: https://github.com/drkameleon/BitReaper
24
25
  licenses:
25
26
  - MIT
26
27
  metadata: {}