ninja2k 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +22 -0
- data/README.md +81 -0
- data/Rakefile +13 -0
- data/lib/ninja2k/scraper.rb +168 -0
- data/lib/ninja2k/version.rb +3 -0
- data/lib/ninja2k.rb +2 -0
- metadata +117 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Randy Morgan
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# Ninja2k
|
2
|
+
|
3
|
+
This gem is a tool for Ninja2k that allows you to scrape Nokogiri parsable resources for specified
|
4
|
+
clues and add hooks to define how those clues are processed. It also
|
5
|
+
lets you export the results into an xlsx file.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'ninja2k'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install ninja2k
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
### Basic Scraping
|
24
|
+
|
25
|
+
require 'ninja2k'
|
26
|
+
|
27
|
+
clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
|
28
|
+
'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
|
29
|
+
'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
|
30
|
+
'Warranty', 'Software included', 'Product color']
|
31
|
+
|
32
|
+
url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
|
33
|
+
selector = "//td[text()='%s']/following-sibling::td"
|
34
|
+
|
35
|
+
scraper = Ninja2k::Scraper.new(url, selector, :clues => clues)
|
36
|
+
scraper.to_xlsx('my_spreadsheet.xlsx')
|
37
|
+
|
38
|
+
|
39
|
+
### With Hooks and Styles
|
40
|
+
|
41
|
+
require 'ninja2k'
|
42
|
+
|
43
|
+
clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
|
44
|
+
'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
|
45
|
+
'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
|
46
|
+
'Warranty', 'Software included', 'Product color']
|
47
|
+
|
48
|
+
url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
|
49
|
+
selector = "//td[text()='%s']/following-sibling::td"
|
50
|
+
|
51
|
+
# Upcases every <br>-separated entry of the matched element.
# The non-bang strip/upcase are used on purpose: strip! returns nil when
# there is nothing to strip, so chaining upcase! onto it raises NoMethodError.
os_hook = Proc.new do |element|
  element.inner_html.split('<br>').map do |datum|
    datum.strip.upcase
  end
end
|
56
|
+
|
57
|
+
|
58
|
+
scraper = Ninja2k::Scraper.new(url, selector, :clues => clues, :hooks => { 'Operating system' => os_hook })
|
59
|
+
|
60
|
+
# You can also alter the xlsx spreadsheet before serializing.
|
61
|
+
# See https://github.com/randym/axlsx
|
62
|
+
|
63
|
+
package = scraper.to_xlsx
|
64
|
+
clue_style = package.workbook.styles.add_style :fg_color => 'FF0000'
|
65
|
+
package.workbook.worksheets.first.col_style(0, clue_style)
|
66
|
+
package.serialize('styled.xlsx')
|
67
|
+
|
68
|
+
## Contributing
|
69
|
+
|
70
|
+
1. Fork it
|
71
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
72
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
73
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
74
|
+
5. Create new Pull Request
|
75
|
+
|
76
|
+
## Copyright and License
|
77
|
+
----------
|
78
|
+
|
79
|
+
Ninja2k © 2012 by [Randy Morgan](mailto:digital.ipseity@gmail.com).
|
80
|
+
|
81
|
+
Ninja2k is licensed under the MIT license. Please see the LICENSE document for more information.
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require 'rake/testtask'

# Defines the :test task directly. Previously the TestTask was created inside
# the action of a task that was itself named :test, so the tests only ran
# because the newly appended action happened to be picked up while the task
# was already executing.
Rake::TestTask.new(:test) do |t|
  t.libs << 'test'
  t.test_files = FileList['test/**/tc_*.rb']
  t.verbose = false
  t.warning = true
end
|
12
|
+
|
13
|
+
task :default => :test
|
@@ -0,0 +1,168 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'axlsx'
|
4
|
+
|
5
|
+
module Ninja2k
|
6
|
+
|
7
|
+
|
8
|
+
# Scraper will load up a specified resource, and search the page using a combination of your selector and any clues given.
|
9
|
+
# It provides a hooking mechanism so you can override the default parsing action (split on <br>, one row for each item found)
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
|
13
|
+
# 'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
|
14
|
+
# 'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
|
15
|
+
# 'Warranty', 'Software included', 'Product color']
|
16
|
+
#
|
17
|
+
# url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
|
18
|
+
# selector = "//td[text()='%s']/following-sibling::td"
|
19
|
+
#
|
20
|
+
# scraper = Ninja2k::Scraper.new(url, selector, :clues => clues)
|
21
|
+
# scraper.to_xlsx('my_spreadsheet.xlsx')
|
22
|
+
class Scraper
|
23
|
+
|
24
|
+
# Creates a new Scraper
|
25
|
+
#
|
26
|
+
# @param [String] url The resource to scrape
|
27
|
+
#
|
28
|
+
# @param [String] selector The xpath select to use when searching for clues. Use %s in the selector to interpolate each clue
|
29
|
+
#
|
30
|
+
# @param [Hash] options each option will be evaluated against a attr_writer using respond_to? If a writer exists, the value for the option is passed to the writer.
|
31
|
+
#
|
32
|
+
# @option [Array] clues The clues to search for
|
33
|
+
#
|
34
|
+
# @option [Hash] hooks A hash of hooks where the key is the clue name the Proc value will be caled against.
|
35
|
+
def initialize(url, selector, options={})
|
36
|
+
self.url = url
|
37
|
+
self.selector = selector
|
38
|
+
options.each do |o|
|
39
|
+
self.send("#{o[0]}=", o[1]) if self.respond_to? "#{o[0]}="
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# The url we will scrape from
|
44
|
+
# @return [String]
|
45
|
+
attr_accessor :url
|
46
|
+
|
47
|
+
# The xpath selector to use when searching for clues
|
48
|
+
# @return [String]
|
49
|
+
attr_accessor :selector
|
50
|
+
|
51
|
+
# The output from scraping as an array
|
52
|
+
# This is populated by the scrape or to_xlsx methods
|
53
|
+
#
|
54
|
+
# @return [Array]
|
55
|
+
def output
|
56
|
+
@output ||= []
|
57
|
+
end
|
58
|
+
|
59
|
+
# A hash of Proc object to call when parsing each item found by the selector and clue combination.
|
60
|
+
# The element found will be passed to the member of this hash that uses the clue as a key
|
61
|
+
#
|
62
|
+
# @see example/example.rb
|
63
|
+
#
|
64
|
+
# @return [Hash]
|
65
|
+
def hooks
|
66
|
+
@hooks ||= {}
|
67
|
+
end
|
68
|
+
|
69
|
+
# @see hooks
|
70
|
+
def hooks=(hash)
|
71
|
+
raise ArgumentError, 'Hooks must be a hash of procs to call when scraping each clue' unless hash.is_a?(Hash)
|
72
|
+
@hooks = hash
|
73
|
+
end
|
74
|
+
|
75
|
+
# Adds a hook to the hook hash
|
76
|
+
#
|
77
|
+
# @param [String] clue the clue this hook will be called for
|
78
|
+
#
|
79
|
+
# @param [Proc] p_roc the Proc to call when the clue is found
|
80
|
+
def add_hook(clue, p_roc)
|
81
|
+
hooks[clue] = p_roc
|
82
|
+
end
|
83
|
+
|
84
|
+
# Scrapes the resourse using the clues and hooks provided
|
85
|
+
#
|
86
|
+
# @return [Array]
|
87
|
+
def scrape
|
88
|
+
@package = nil
|
89
|
+
@output = []
|
90
|
+
clues.each do |clue|
|
91
|
+
if detail = parse_clue(clue)
|
92
|
+
output << [clue, detail.pop]
|
93
|
+
detail.each { |datum| output << ['', datum] }
|
94
|
+
end
|
95
|
+
end
|
96
|
+
output
|
97
|
+
end
|
98
|
+
|
99
|
+
# seralizes the output to xlsx. If you do not specify the file_name parameter
|
100
|
+
# The package will be created, but not serialized to disk. This means you can use the return value
|
101
|
+
# to stream the data using to_xlsx(false).to_stream.read
|
102
|
+
#
|
103
|
+
# @param [String] filename the filename to use in output
|
104
|
+
#
|
105
|
+
# @return [Axlsx::Package]
|
106
|
+
def to_xlsx(filename=false)
|
107
|
+
scrape
|
108
|
+
serialize(filename)
|
109
|
+
end
|
110
|
+
|
111
|
+
# The clues we are going to look for with the selector in the document returned by url
|
112
|
+
#
|
113
|
+
# @return [Array]
|
114
|
+
def clues
|
115
|
+
@clues ||= []
|
116
|
+
end
|
117
|
+
|
118
|
+
# Sets the clues for the scraper
|
119
|
+
#
|
120
|
+
# @param [Arrray] value The clues to look for.
|
121
|
+
def clues=(value)
|
122
|
+
raise ArugmentError, 'clues must be an array of strings to search for with your selector' unless value.is_a?(Array)
|
123
|
+
@clues = value
|
124
|
+
end
|
125
|
+
|
126
|
+
# The axlsx package used for xlsx serialization
|
127
|
+
#
|
128
|
+
# @return [Axlsx::Package]
|
129
|
+
def package
|
130
|
+
@package ||= Axlsx::Package.new
|
131
|
+
end
|
132
|
+
|
133
|
+
private
|
134
|
+
|
135
|
+
def doc
|
136
|
+
@doc ||= begin
|
137
|
+
Nokogiri::HTML(open(@url))
|
138
|
+
rescue
|
139
|
+
raise ArgumentError, 'Invalid URL - Nothing to parse'
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def selector_for_clue(clue)
|
144
|
+
@selector % clue
|
145
|
+
end
|
146
|
+
|
147
|
+
def parse_clue(clue)
|
148
|
+
if element = doc.at(selector_for_clue(clue))
|
149
|
+
call_hook(clue, element) || element.inner_html.split('<br>').each(&:strip!)
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
def call_hook(clue, element)
|
154
|
+
if hooks[clue].is_a? Proc
|
155
|
+
value = hooks[clue].call(element)
|
156
|
+
value.is_a?(Array) ? value : [value]
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
def serialize(file_name)
|
161
|
+
package.workbook.add_worksheet do |sheet|
|
162
|
+
output.each { |datum| sheet.add_row datum }
|
163
|
+
end
|
164
|
+
package.serialize(file_name) if file_name
|
165
|
+
package
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
data/lib/ninja2k.rb
ADDED
metadata
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ninja2k
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Randy Morgan
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-08-03 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: axlsx
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: kramdown
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: A web resource scraper that lets you define xpath selectors, clues and
|
79
|
+
hooks for custom parsing as well as export to xlsx.
|
80
|
+
email:
|
81
|
+
- digital.ipseity@gmail.com
|
82
|
+
executables: []
|
83
|
+
extensions: []
|
84
|
+
extra_rdoc_files: []
|
85
|
+
files:
|
86
|
+
- lib/ninja2k/scraper.rb
|
87
|
+
- lib/ninja2k/version.rb
|
88
|
+
- lib/ninja2k.rb
|
89
|
+
- README.md
|
90
|
+
- LICENSE
|
91
|
+
- Rakefile
|
92
|
+
homepage: https://github.com/randym/ninja2k
|
93
|
+
licenses: []
|
94
|
+
post_install_message:
|
95
|
+
rdoc_options: []
|
96
|
+
require_paths:
|
97
|
+
- lib
|
98
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
requirements: []
|
111
|
+
rubyforge_project:
|
112
|
+
rubygems_version: 1.8.24
|
113
|
+
signing_key:
|
114
|
+
specification_version: 3
|
115
|
+
summary: ninja2k is a gem for ninja2k
|
116
|
+
test_files: []
|
117
|
+
has_rdoc:
|