ninja2k 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Randy Morgan
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,81 @@
1
+ # Ninja2k
2
+
3
+ Ninja2k is a gem that allows you to scrape Nokogiri-parsable resources for specified
4
+ clues and add hooks to define how those clues are processed. It also
5
+ lets you export the results into an xlsx file.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'ninja2k'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install ninja2k
20
+
21
+ ## Usage
22
+
23
+ ### Basic Scraping
24
+
25
+ require 'ninja2k'
26
+
27
+ clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
28
+ 'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
29
+ 'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
30
+ 'Warranty', 'Software included', 'Product color']
31
+
32
+ url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
33
+ selector = "//td[text()='%s']/following-sibling::td"
34
+
35
+ scraper = Ninja2k::Scraper.new(url, selector, :clues => clues)
36
+ scraper.to_xlsx('my_spreadsheet.xlsx')
37
+
38
+
39
+ ### With Hooks and Styles
40
+
41
+ require 'ninja2k'
42
+
43
+ clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
44
+ 'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
45
+ 'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
46
+ 'Warranty', 'Software included', 'Product color']
47
+
48
+ url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
49
+ selector = "//td[text()='%s']/following-sibling::td"
50
+
51
+ os_hook = Proc.new do |element|
52
+ element.inner_html.split('<br>').map do |datum|
53
+ datum.strip.upcase
54
+ end
55
+ end
56
+
57
+
58
+ scraper = Ninja2k::Scraper.new(url, selector, :clues => clues, :hooks => { 'Operating system' => os_hook })
59
+
60
+ # You can also alter the xlsx spreadsheet before serializing.
61
+ # See https://github.com/randym/axlsx
62
+
63
+ package = scraper.to_xlsx
64
+ clue_style = package.workbook.styles.add_style :fg_color => 'FF0000'
65
+ package.workbook.worksheets.first.col_style(0, clue_style)
66
+ package.serialize('styled.xlsx')
67
+
68
+ ## Contributing
69
+
70
+ 1. Fork it
71
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
72
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
73
+ 4. Push to the branch (`git push origin my-new-feature`)
74
+ 5. Create new Pull Request
75
+
76
+ ## Copyright and License
77
+ ----------
78
+
79
+ Ninja2k &copy; 2012 by [Randy Morgan](mailto:digital.ipseity@gmail.com).
80
+
81
+ Ninja2k is licensed under the MIT license. Please see the LICENSE document for more information.
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ task :test do
4
+ require 'rake/testtask'
5
+ Rake::TestTask.new do |t|
6
+ t.libs << 'test'
7
+ t.test_files = FileList['test/**/tc_*.rb']
8
+ t.verbose = false
9
+ t.warning = true
10
+ end
11
+ end
12
+
13
+ task :default => :test
@@ -0,0 +1,168 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'axlsx'
4
+
5
+ module Ninja2k
6
+
7
+
8
+ # Scraper will load up a specified resource, and search the page using a combination of your seletor and any clues given.
9
+ # It provides a hooking mechanism so you can override the default parsing action (split on <br>, one row for each item found)
10
+ #
11
+ # @example
12
+ # clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
13
+ # 'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
14
+ # 'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
15
+ # 'Warranty', 'Software included', 'Product color']
16
+ #
17
+ # url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
18
+ # selector = "//td[text()='%s']/following-sibling::td"
19
+ #
20
+ # scraper = Ninja2k::Scraper.new(url, selector, :clues => clues)
21
+ # scraper.to_xlsx('my_spreadsheet.xlsx')
22
+ class Scraper
23
+
24
+ # Creates a new Scraper
25
+ #
26
+ # @param [String] url The resource to scrape
27
+ #
28
+ # @param [String] selector The xpath select to use when searching for clues. Use %s in the selector to interpolate each clue
29
+ #
30
+ # @param [Hash] options each option will be evaluated against a attr_writer using respond_to? If a writer exists, the value for the option is passed to the writer.
31
+ #
32
+ # @option [Array] clues The clues to search for
33
+ #
34
+ # @option [Hash] hooks A hash of hooks where the key is the clue name the Proc value will be caled against.
35
+ def initialize(url, selector, options={})
36
+ self.url = url
37
+ self.selector = selector
38
+ options.each do |o|
39
+ self.send("#{o[0]}=", o[1]) if self.respond_to? "#{o[0]}="
40
+ end
41
+ end
42
+
43
+ # The url we will scrape from
44
+ # @return [String]
45
+ attr_accessor :url
46
+
47
+ # The xpath selector to use when searching for clues
48
+ # @return [String]
49
+ attr_accessor :selector
50
+
51
+ # The output from scraping as an array
52
+ # This is populated by the scrape or to_xlsx methods
53
+ #
54
+ # @return [Array]
55
+ def output
56
+ @output ||= []
57
+ end
58
+
59
+ # A hash of Proc object to call when parsing each item found by the selector and clue combination.
60
+ # The element found will be passed to the member of this hash that uses the clue as a key
61
+ #
62
+ # @see example/example.rb
63
+ #
64
+ # @return [Hash]
65
+ def hooks
66
+ @hooks ||= {}
67
+ end
68
+
69
+ # @see hooks
70
+ def hooks=(hash)
71
+ raise ArgumentError, 'Hooks must be a hash of procs to call when scraping each clue' unless hash.is_a?(Hash)
72
+ @hooks = hash
73
+ end
74
+
75
+ # Adds a hook to the hook hash
76
+ #
77
+ # @param [String] clue the clue this hook will be called for
78
+ #
79
+ # @param [Proc] p_roc the Proc to call when the clue is found
80
+ def add_hook(clue, p_roc)
81
+ hooks[clue] = p_roc
82
+ end
83
+
84
+ # Scrapes the resourse using the clues and hooks provided
85
+ #
86
+ # @return [Array]
87
+ def scrape
88
+ @package = nil
89
+ @output = []
90
+ clues.each do |clue|
91
+ if detail = parse_clue(clue)
92
+ output << [clue, detail.pop]
93
+ detail.each { |datum| output << ['', datum] }
94
+ end
95
+ end
96
+ output
97
+ end
98
+
99
+ # seralizes the output to xlsx. If you do not specify the file_name parameter
100
+ # The package will be created, but not serialized to disk. This means you can use the return value
101
+ # to stream the data using to_xlsx(false).to_stream.read
102
+ #
103
+ # @param [String] filename the filename to use in output
104
+ #
105
+ # @return [Axlsx::Package]
106
+ def to_xlsx(filename=false)
107
+ scrape
108
+ serialize(filename)
109
+ end
110
+
111
+ # The clues we are going to look for with the selector in the document returned by url
112
+ #
113
+ # @return [Array]
114
+ def clues
115
+ @clues ||= []
116
+ end
117
+
118
+ # Sets the clues for the scraper
119
+ #
120
+ # @param [Arrray] value The clues to look for.
121
+ def clues=(value)
122
+ raise ArugmentError, 'clues must be an array of strings to search for with your selector' unless value.is_a?(Array)
123
+ @clues = value
124
+ end
125
+
126
+ # The axlsx package used for xlsx serialization
127
+ #
128
+ # @return [Axlsx::Package]
129
+ def package
130
+ @package ||= Axlsx::Package.new
131
+ end
132
+
133
+ private
134
+
135
+ def doc
136
+ @doc ||= begin
137
+ Nokogiri::HTML(open(@url))
138
+ rescue
139
+ raise ArgumentError, 'Invalid URL - Nothing to parse'
140
+ end
141
+ end
142
+
143
+ def selector_for_clue(clue)
144
+ @selector % clue
145
+ end
146
+
147
+ def parse_clue(clue)
148
+ if element = doc.at(selector_for_clue(clue))
149
+ call_hook(clue, element) || element.inner_html.split('<br>').each(&:strip!)
150
+ end
151
+ end
152
+
153
+ def call_hook(clue, element)
154
+ if hooks[clue].is_a? Proc
155
+ value = hooks[clue].call(element)
156
+ value.is_a?(Array) ? value : [value]
157
+ end
158
+ end
159
+
160
+ def serialize(file_name)
161
+ package.workbook.add_worksheet do |sheet|
162
+ output.each { |datum| sheet.add_row datum }
163
+ end
164
+ package.serialize(file_name) if file_name
165
+ package
166
+ end
167
+ end
168
+ end
@@ -0,0 +1,3 @@
1
+ module Ninja2k
2
+ VERSION = "0.0.1"
3
+ end
data/lib/ninja2k.rb ADDED
@@ -0,0 +1,2 @@
1
+ require "ninja2k/version"
2
+ require "ninja2k/scraper.rb"
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ninja2k
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Randy Morgan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-08-03 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: axlsx
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: kramdown
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: A web resource scraper that lets you define xpath selectors, clues and
79
+ hooks for custom parsing as well as export to xlsx.
80
+ email:
81
+ - digital.ipseity@gmail.com
82
+ executables: []
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - lib/ninja2k/scraper.rb
87
+ - lib/ninja2k/version.rb
88
+ - lib/ninja2k.rb
89
+ - README.md
90
+ - LICENSE
91
+ - Rakefile
92
+ homepage: https://github.com/randym/ninja2k
93
+ licenses: []
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ requirements: []
111
+ rubyforge_project:
112
+ rubygems_version: 1.8.24
113
+ signing_key:
114
+ specification_version: 3
115
+ summary: ninja2k is a gem for ninja2k
116
+ test_files: []
117
+ has_rdoc: