bitreaper 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ef322874a2e0f8e3876e554671d80ff724d6fb598be0c7e25be992c268bbac5a
4
- data.tar.gz: c6c21e747415b48c0246cf07bb95d4512238d358859699593235365a69c0f752
3
+ metadata.gz: 806186133e8f475e8040fc7bee4e676d49665f83f16bcb127c171b7239e0aa94
4
+ data.tar.gz: 693e9fbe65d0b4e697c9cf7eaa871ed905b35509b8a0ff7dd0953933c8b77635
5
5
  SHA512:
6
- metadata.gz: 3c7fd3310b52989dab3299294944c507ca39cecffd02544ce6c4a9f95a2db2016dee184c9bbdb5848ba5411297daa021abb9ff1c010ee2250976561fd3af5f08
7
- data.tar.gz: 6d0294a916a199ddf934ad873b7943a2a8106e1c4209b5d2125c26fd5f2fb882d5f8730c24495e0286687b22aeda8329301359b6c11761735c354dabde60a81f
6
+ metadata.gz: 3080d190593d846cb6cc225c9e775e3899725a10e8785b7d506ba931d12203bdf76f7b0fe533f09b6b074761c1df9f0d46a80c513849377b1ec0cc8b49fdf4f6
7
+ data.tar.gz: 84a2212ce1ab9c9ccd3a43cec0bcff53d3b6ff814d56bc833488668193b7e8ffc80b18732a07b0c5409fa392e645c8c7de5a58f2f052da3bcee772adffe26dbe
@@ -13,6 +13,8 @@
13
13
 
14
14
  require 'colorize'
15
15
  require 'optparse'
16
+ require 'parallel'
17
+ require 'ruby-progressbar'
16
18
 
17
19
  require 'bitreaper'
18
20
 
@@ -24,8 +26,9 @@ require 'bitreaper'
24
26
 
25
27
  $url = ""
26
28
  $inputFile = ""
27
- $outputFile = "output.json"
29
+ $outputDest = "output"
28
30
 
31
+ $parallel = false
29
32
  $verbose = false
30
33
 
31
34
  # Parse command-line options
@@ -36,9 +39,10 @@ ARGV.options do |opts|
36
39
  "Usage: bitreaper <parser> [options]\n\n"
37
40
 
38
41
  opts.on("-iINPUT", "--input==INPUT",String,"Set input file") {|val| $inputFile = val }
39
- opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output file") {|val| $outputFile = val }
42
+ opts.on("-oOUTPUT", "--output=OUTPUT",String,"Set output name") {|val| $outputDest = val }
40
43
  opts.on("-uURL", "--url=URL",String,"Set single url input") {|val| $url = val }
41
- opts.on("-v", "--verbose", String, "Print debug messages") {|val| $verbose = true }
44
+ opts.on("-p", "--parallel", "Perform multi-threaded processing"){|val| $parallel = true }
45
+ opts.on("-v", "--verbose", "Print debug messages") {|val| $verbose = true }
42
46
  opts.on_tail("-h", "--help","Show this help page") {
43
47
  puts opts
44
48
  puts ""
@@ -82,13 +86,27 @@ if $inputFile!=""
82
86
 
83
87
  # Process project
84
88
  $store = []
85
- $urls.each_with_index{|url,i|
86
- br = BitReaper.new(url,$parser,i)
87
- $store << br.process()
88
- }
89
+ if not $parallel
90
+ $urls.each_with_index{|url,i|
91
+ br = BitReaper.new(url,$parser,i)
92
+ $store << br.process()
93
+ }
94
+ else
95
+ if not $verbose
96
+ Parallel.each_with_index($urls, in_threads: 4, progress: " ► Processing... "){|url,i|
97
+ br = BitReaper.new(url,$parser,i)
98
+ $store << br.process()
99
+ }
100
+ else
101
+ Parallel.each_with_index($urls, in_threads: 4){|url,i|
102
+ br = BitReaper.new(url,$parser,i)
103
+ $store << br.process()
104
+ }
105
+ end
106
+ end
89
107
 
90
108
  # Save store to file
91
- saveStoreToFile($outputFile,$store)
109
+ saveStoreToFile("#{$outputDest}.json",$store)
92
110
 
93
111
  else
94
112
 
@@ -101,7 +119,7 @@ else
101
119
  $store = br.process()
102
120
 
103
121
  # Save store to file
104
- saveStoreToFile($outputFile,$store)
122
+ saveStoreToFile("#{$outputDest}.json",$store)
105
123
 
106
124
  end
107
125
 
@@ -26,13 +26,22 @@ require_relative 'bitreaper/helpers.rb'
26
26
  # SUPERGLOBALS
27
27
  ##########################################
28
28
 
29
- $bitreaper_version = 0.1
29
+ $bitreaper_version = 0.1.2
30
30
 
31
- ##########################################
32
- # MAIN CLASS
33
- ##########################################
31
+ ####################################################################################
32
+ # **MAIN CLASS**
33
+ # This is the main Web Scraper object. It is through a `BitReaper` instance
34
+ # that you can start scraping
35
+ ####################################################################################
34
36
 
35
37
  class BitReaper
38
+
39
+ # Create a new BitReaper instance
40
+ #
41
+ # @param [String] url The URL of the page to be scraped
42
+ # @param [String,SDL4R::Tag] parser The parser
43
+ # @param [Integer] i Index of the current operation (for reporting purposes)
44
+ #---------------------------------------------------------------------------
36
45
  def initialize(url,parser,i=0)
37
46
  @url = url
38
47
  @parser = (parser.is_a? String) ? self.getParser(parser) : parser
@@ -43,10 +52,16 @@ class BitReaper
43
52
  @noko = self.download(@url)
44
53
  end
45
54
 
55
+ # Get a new parser from a given parser path
56
+ #
57
+ # @param [String] file The path of the `.br` parser file
58
+ #
59
+ # @return [SDL4R::Tag] The resulting parser
60
+ #---------------------------------------------------------------------------
46
61
  def self.getParser(file)
47
62
  parserFile = File.read(file)
48
63
  parserFile = parserFile.gsub(/([\w]+)\!\s/,'\1=on')
49
- if true
64
+ if $verbose
50
65
  puts parserFile.split("\n").map{|l| " "+l}.join("\n").light_black
51
66
  puts ""
52
67
  end
@@ -54,12 +69,38 @@ class BitReaper
54
69
  return SDL4R::read(parserFile)
55
70
  end
56
71
 
72
+ # Process current project
73
+ #---------------------------------------------------------------------------
74
+ def process
75
+ printProgress(@url,@index,1)
76
+ processNode(@noko, @parser, @store)
77
+
78
+ printProgress(@url,@index,2)
79
+ return @store
80
+ end
81
+
82
+ private
83
+
84
+ # Download given URL
85
+ #
86
+ # @param [String] url The URL to be downloaded
87
+ #
88
+ # @return [Nokogiri::HTML::Document] The parsed HTML document
89
+ #---------------------------------------------------------------------------
57
90
  def download(url,withProgress=true)
58
91
  printProgress(@url,@index,0) if withProgress
59
92
 
60
93
  return Nokogiri::HTML(open(url))
61
94
  end
62
95
 
96
+ # Process String value using attribute
97
+ #
98
+ # @param [String] attrb The attribute to be processed
99
+ # @param [String] val The value to be processed
100
+ # @param [String] param The attribute's param (if any)
101
+ #
102
+ # @return [String,Array] The result of the operation
103
+ #---------------------------------------------------------------------------
63
104
  def processStringValue(attrb,val,param)
64
105
  case attrb
65
106
  when "prepend"
@@ -80,10 +121,21 @@ class BitReaper
80
121
  val = val.gsub(param,"")
81
122
  when "split"
82
123
  val = val.split(param)
124
+ when "download"
125
+ val = val
126
+ val.downloadAs($outputDest,(param.is_a? String) ? param : nil)
83
127
  end
84
128
  return val
85
129
  end
86
130
 
131
+ # Process Array value using attribute
132
+ #
133
+ # @param [String] attrb The attribute to be processed
134
+ # @param [Array] val The value to processed
135
+ # @param [String] param The attribute's param (if any)
136
+ #
137
+ # @return [String,Array] The result of the operation
138
+ #---------------------------------------------------------------------------
87
139
  def processArrayValue(attrb,val,param)
88
140
  case attrb
89
141
  when "join"
@@ -110,6 +162,13 @@ class BitReaper
110
162
  return val
111
163
  end
112
164
 
165
+ # Process parsed values using set of attributes
166
+ #
167
+ # @param [Array] values The parsed values
168
+ # @param [Array] attrbs The associated attributes
169
+ #
170
+ # @return [String,Array] The result of the operation
171
+ #---------------------------------------------------------------------------
113
172
  def processValues(values,attrbs)
114
173
  # check if we have a single value or an array of values
115
174
  ret = (values.count==1) ? values[0].content
@@ -138,6 +197,13 @@ class BitReaper
138
197
  return (ret.nil?) ? "" : ret
139
198
  end
140
199
 
200
+ # Process a given node using provided parser and temporary storage hash
201
+ #
202
+ # @param [Nokogiri::XML::Node] noko The Nokogiri node to work on
203
+ # @param [SDL4R::Tag] node The parser node
204
+ # @param [Hash] store The temporary storage hash
205
+ # @param [Integer] level The nesting level (for informational purposes)
206
+ #---------------------------------------------------------------------------
141
207
  def processNode(noko,node,store,level=0)
142
208
  node.children.each{|child|
143
209
  command = child.namespace
@@ -165,14 +231,6 @@ class BitReaper
165
231
  }
166
232
  end
167
233
 
168
- def process
169
- printProgress(@url,@index,1)
170
- processNode(@noko, @parser, @store)
171
-
172
- printProgress(@url,@index,2)
173
- return @store
174
- end
175
-
176
234
  end
177
235
 
178
236
  #######################################################
@@ -10,10 +10,15 @@
10
10
  # @file lib/bitreaper/helpers.rb
11
11
  #######################################################
12
12
 
13
+ require 'down'
14
+ require 'fileutils'
15
+
13
16
  ##########################################
14
17
  # HELPER FUNCTIONS
15
18
  ##########################################
16
19
 
20
+ ## Misc
21
+
17
22
  class String
18
23
  def ellipsisize(minimum_length=15,edge_length=15)
19
24
  return self if self.length < minimum_length or self.length <= edge_length*2
@@ -21,8 +26,30 @@ class String
21
26
  mid_length = self.length - edge_length*2
22
27
  gsub(/(#{edge}).{#{mid_length},}(#{edge})/, '\1...\2')
23
28
  end
29
+
30
+ def downloadAs(dest,filename=nil)
31
+ # make sure 'dest' path and any dir included in the filename path exist
32
+ FileUtils.mkdir_p dest unless Dir.exist? dest
33
+ if not filename.nil?
34
+ subdir = File.join(dest, File.dirname(filename))
35
+ FileUtils.mkdir_p subdir unless Dir.exist? subdir
36
+ end
37
+
38
+ # download it
39
+ tmpfile = Down.download(self)
40
+
41
+ # in case a filename is specified, save it like that
42
+ # otherwise, try using the original filaname
43
+ if not filename.nil?
44
+ FileUtils.mv(tmpfile.path, "#{dest}/#{filename}")
45
+ else
46
+ FileUtils.mv(tmpfile.path, "#{dest}/#{tmpfile.original_filename}")
47
+ end
48
+ end
24
49
  end
25
50
 
51
+ ## Core
52
+
26
53
  def printLogo
27
54
  puts (" ____ _ _ ____\n" +
28
55
  " | __ )(_) |_| _ \\ ___ __ _ _ __ ___ _ __\n" +
@@ -66,8 +93,15 @@ def printProgress(item,indx,stage)
66
93
  msg += "\n"
67
94
  end
68
95
 
69
- print "\r"
70
- print " ► [#{indx+1}/#{$total}] ".bold + item.ellipsisize.light_magenta.underline + " ➔ " + msg
96
+ toPrint = ("" + "[#{indx+1}/#{$total}] ".ljust(12)).bold + item.ellipsisize.light_magenta.underline + " ➔ " + msg
97
+ if $parallel
98
+ if $verbose
99
+ puts toPrint
100
+ end
101
+ else
102
+ print "\r"
103
+ print toPrint
104
+ end
71
105
  end
72
106
 
73
107
  def saveStoreToFile(file,store)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bitreaper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dr.Kameleon
@@ -10,8 +10,9 @@ bindir: bin
10
10
  cert_chain: []
11
11
  date: 2020-04-09 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Automated Web-Scraping Client for Ruby
14
- email: yaniszaf@gmail.com
13
+ description: Automated Web-Scraping Client for Ruby using SLD2-like configuration
14
+ files. Supports XPath and CSS selectors via Nokogiri.
15
+ email: drkameleon@gmail.com
15
16
  executables:
16
17
  - bitreaper
17
18
  extensions: []
@@ -20,7 +21,7 @@ files:
20
21
  - bin/bitreaper
21
22
  - lib/bitreaper.rb
22
23
  - lib/bitreaper/helpers.rb
23
- homepage: https://rubygems.org/gems/bitreaper
24
+ homepage: https://github.com/drkameleon/BitReaper
24
25
  licenses:
25
26
  - MIT
26
27
  metadata: {}