bitreaper 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bitreaper.rb +27 -29
- metadata +114 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8099d3d4a818b30b2ac24729cc564e0f30ca282fa0dd169961b4440f74e53d87
|
4
|
+
data.tar.gz: 507a81e5d915195358dc62c1cc8fddc86565778b97d02bf2cefe4951748d64f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a89b5f7e9c6e5bb7bbd019a3cab051a5ceea8f3cd5c195ab1a95c7a3c776279567180918ad3aebb4cd911d83d5ea80bc464e3661cb4314200433d3e5d483af7c
|
7
|
+
data.tar.gz: 07ad9cecd4e25faca388ce8e3fe05612a748189873f17acf21b07560099d1af8dff0acbc3ffe3556f99a1e6356c18a1b88fb54b7c7089fb9210112dcc127b7cd
|
data/lib/bitreaper.rb
CHANGED
@@ -17,31 +17,24 @@ require 'liquid'
|
|
17
17
|
require 'nokogiri'
|
18
18
|
require 'open-uri'
|
19
19
|
require 'sdl4r'
|
20
|
-
require 'watir'
|
21
|
-
require 'webdrivers'
|
22
20
|
|
23
21
|
require_relative 'bitreaper/helpers.rb'
|
24
22
|
|
25
|
-
|
26
|
-
# SUPERGLOBALS
|
27
|
-
##########################################
|
23
|
+
$bitreaper_version = "0.1.3"
|
28
24
|
|
29
|
-
|
30
|
-
|
31
|
-
####################################################################################
|
32
|
-
# **MAIN CLASS**
|
25
|
+
##
|
33
26
|
# This is the main Web Scraper object. It is through a `BitReaper` instance
|
34
27
|
# that you can start scraping
|
35
|
-
####################################################################################
|
36
28
|
|
37
29
|
class BitReaper
|
38
30
|
|
31
|
+
##
|
39
32
|
# Create a new BitReaper instance
|
40
33
|
#
|
41
34
|
# @param [String] url The URL of the page to be scraped
|
42
35
|
# @param [String,SDL4R::Tag] parser The parser
|
43
36
|
# @param [Integer] i Index of the current operation (for reporting purposes)
|
44
|
-
|
37
|
+
|
45
38
|
def initialize(url,parser,i=0)
|
46
39
|
@url = url
|
47
40
|
@parser = (parser.is_a? String) ? self.getParser(parser) : parser
|
@@ -52,12 +45,13 @@ class BitReaper
|
|
52
45
|
@noko = self.download(@url)
|
53
46
|
end
|
54
47
|
|
48
|
+
##
|
55
49
|
# Get a new parser from a given parser path
|
56
50
|
#
|
57
51
|
# @param [String] file The path of the `.br` parser file
|
58
52
|
#
|
59
53
|
# @return [SDL4R::Tag] The resulting parser
|
60
|
-
|
54
|
+
|
61
55
|
def self.getParser(file)
|
62
56
|
parserFile = File.read(file)
|
63
57
|
parserFile = parserFile.gsub(/([\w]+)\!\s/,'\1=on')
|
@@ -69,30 +63,20 @@ class BitReaper
|
|
69
63
|
return SDL4R::read(parserFile)
|
70
64
|
end
|
71
65
|
|
72
|
-
|
73
|
-
#---------------------------------------------------------------------------
|
74
|
-
def process
|
75
|
-
printProgress(@url,@index,1)
|
76
|
-
processNode(@noko, @parser, @store)
|
77
|
-
|
78
|
-
printProgress(@url,@index,2)
|
79
|
-
return @store
|
80
|
-
end
|
81
|
-
|
82
|
-
private
|
83
|
-
|
66
|
+
##
|
84
67
|
# Download given URL
|
85
68
|
#
|
86
69
|
# @param [String] url The URL to be downloaded
|
87
70
|
#
|
88
71
|
# @return [Nokogiri::HTML::Document] The parsed HTML document
|
89
|
-
|
72
|
+
|
90
73
|
def download(url,withProgress=true)
|
91
74
|
printProgress(@url,@index,0) if withProgress
|
92
75
|
|
93
76
|
return Nokogiri::HTML(open(url))
|
94
77
|
end
|
95
78
|
|
79
|
+
##
|
96
80
|
# Process String value using attribute
|
97
81
|
#
|
98
82
|
# @param [String] attrb The attribute to be processed
|
@@ -100,7 +84,7 @@ class BitReaper
|
|
100
84
|
# @param [String] param The attribute's param (if any)
|
101
85
|
#
|
102
86
|
# @return [String,Array] The result of the operation
|
103
|
-
|
87
|
+
|
104
88
|
def processStringValue(attrb,val,param)
|
105
89
|
case attrb
|
106
90
|
when "prepend"
|
@@ -128,6 +112,7 @@ class BitReaper
|
|
128
112
|
return val
|
129
113
|
end
|
130
114
|
|
115
|
+
##
|
131
116
|
# Process Array value using attribute
|
132
117
|
#
|
133
118
|
# @param [String] attrb The attribute to be processed
|
@@ -135,7 +120,7 @@ class BitReaper
|
|
135
120
|
# @param [String] param The attribute's param (if any)
|
136
121
|
#
|
137
122
|
# @return [String,Array] The result of the operation
|
138
|
-
|
123
|
+
|
139
124
|
def processArrayValue(attrb,val,param)
|
140
125
|
case attrb
|
141
126
|
when "join"
|
@@ -162,13 +147,14 @@ class BitReaper
|
|
162
147
|
return val
|
163
148
|
end
|
164
149
|
|
150
|
+
##
|
165
151
|
# Process parsed values using set of attributes
|
166
152
|
#
|
167
153
|
# @param [Array] values The parsed values
|
168
154
|
# @param [Array] attrbs The associated attributes
|
169
155
|
#
|
170
156
|
# @return [String,Array] The result of the operation
|
171
|
-
|
157
|
+
|
172
158
|
def processValues(values,attrbs)
|
173
159
|
# check if we have a single value or an array of values
|
174
160
|
ret = (values.count==1) ? values[0].content
|
@@ -197,13 +183,14 @@ class BitReaper
|
|
197
183
|
return (ret.nil?) ? "" : ret
|
198
184
|
end
|
199
185
|
|
186
|
+
##
|
200
187
|
# Process a given node using provided parser and temporary storage hash
|
201
188
|
#
|
202
189
|
# @param [Nokogiri::XML::Node] noko The Nokogiri node to work on
|
203
190
|
# @param [SDL4R::Tag] node The parser node
|
204
191
|
# @param [Hash] store The temporary storage hash
|
205
192
|
# @param [Integer] level The nesting level (for informational purposes)
|
206
|
-
|
193
|
+
|
207
194
|
def processNode(noko,node,store,level=0)
|
208
195
|
node.children.each{|child|
|
209
196
|
command = child.namespace
|
@@ -231,6 +218,17 @@ class BitReaper
|
|
231
218
|
}
|
232
219
|
end
|
233
220
|
|
221
|
+
##
|
222
|
+
# Process the downloaded page with the parser and return the resulting store
|
223
|
+
|
224
|
+
def process
|
225
|
+
printProgress(@url,@index,1)
|
226
|
+
processNode(@noko, @parser, @store)
|
227
|
+
|
228
|
+
printProgress(@url,@index,2)
|
229
|
+
return @store
|
230
|
+
end
|
231
|
+
|
234
232
|
end
|
235
233
|
|
236
234
|
#######################################################
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bitreaper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dr.Kameleon
|
@@ -9,7 +9,119 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
date: 2020-04-09 00:00:00.000000000 Z
|
12
|
-
dependencies:
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: awesome_print
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: colorize
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: down
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: fileutils
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: json
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: liquid
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: nokogiri
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: sdl4r
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
13
125
|
description: Automated Web-Scraping Client for Ruby using SLD2-like configuration
|
14
126
|
files. Supports XPath and CSS selectors via Nokogiri.
|
15
127
|
email: drkameleon@gmail.com
|