ninja2k 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Randy Morgan
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,81 @@
1
+ # Ninja2k
2
+
3
+ This gem is a tool for Ninja2k that allows you to scrape Nokogiri-parsable resources for specified
4
+ clues and add hooks to define how those clues are processed. It also
5
+ lets you export the results into an xlsx file.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'ninja2k'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install ninja2k
20
+
21
+ ## Usage
22
+
23
+ ### Basic Scraping
24
+
25
+ require 'ninja2k'
26
+
27
+ clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
28
+ 'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
29
+ 'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
30
+ 'Warranty', 'Software included', 'Product color']
31
+
32
+ url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
33
+ selector = "//td[text()='%s']/following-sibling::td"
34
+
35
+ scraper = Ninja2k::Scraper.new(url, selector, :clues => clues)
36
+ scraper.to_xlsx('my_spreadsheet.xlsx')
37
+
38
+
39
+ ### With Hooks and Styles
40
+
41
+ require 'ninja2k'
42
+
43
+ clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
44
+ 'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
45
+ 'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
46
+ 'Warranty', 'Software included', 'Product color']
47
+
48
+ url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
49
+ selector = "//td[text()='%s']/following-sibling::td"
50
+
51
+ os_hook = Proc.new do |element|
52
+ element.inner_html.split('<br>').each do |datum|
53
+ datum.strip!.upcase!
54
+ end
55
+ end
56
+
57
+
58
+ scraper = Ninja2k::Scraper.new(url, selector, :clues => clues, :hooks => { 'Operating system' => os_hook })
59
+
60
+ # You can also alter the xlsx spreadsheet before serializing.
61
+ # See https://github.com/randym/axlsx
62
+
63
+ package = scraper.to_xlsx
64
+ clue_style = package.workbook.styles.add_style :fg_color => 'FF0000'
65
+ package.workbook.worksheets.first.col_style(0, clue_style)
66
+ package.serialize('styled.xlsx')
67
+
68
+ ## Contributing
69
+
70
+ 1. Fork it
71
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
72
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
73
+ 4. Push to the branch (`git push origin my-new-feature`)
74
+ 5. Create new Pull Request
75
+
76
+ ## Copyright and License
77
+ ----------
78
+
79
+ Ninja2k &copy; 2012 by [Randy Morgan](mailto:digital.ipseity@gmail.com).
80
+
81
+ Ninja2k is licensed under the MIT license. Please see the LICENSE document for more information.
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
# Configures a Rake::TestTask over every test/**/tc_*.rb file, with 'test'
# added to the load path, verbose output off and the warning flag on.
#
# NOTE(review): the TestTask is created while the :test task is already
# running; presumably this enhances :test with the actual test run -- confirm
# against Rake's behaviour before restructuring.
task :test do
  # Required here rather than at the top of the file, so it is only loaded
  # when the :test task actually executes.
  require 'rake/testtask'
  Rake::TestTask.new do |test_task|
    test_task.libs.push('test')
    test_task.test_files = FileList['test/**/tc_*.rb']
    test_task.verbose = false
    test_task.warning = true
  end
end

# The default task depends on :test, so a bare `rake` invokes it.
task default: :test
@@ -0,0 +1,168 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'axlsx'
4
+
5
module Ninja2k

  # Scraper loads a resource and searches it using a combination of your
  # selector and each of the clues given. It provides a hooking mechanism so
  # you can override the default parsing action (split the matched element's
  # inner HTML on <br>, one row for each item found).
  #
  # @example
  #   clues = ['Operating system', 'Processors', 'Chipset', 'Memory type', 'Hard drive', 'Graphics',
  #            'Ports', 'Webcam', 'Pointing device', 'Keyboard', 'Network interface', 'Chipset', 'Wireless',
  #            'Power supply type', 'Energy efficiency', 'Weight', 'Minimum dimensions (W x D x H)',
  #            'Warranty', 'Software included', 'Product color']
  #
  #   url = "http://h10010.www1.hp.com/wwpc/ie/en/ho/WF06b/321957-321957-3329742-89318-89318-5186820-5231694.html?dnr=1"
  #   selector = "//td[text()='%s']/following-sibling::td"
  #
  #   scraper = Ninja2k::Scraper.new(url, selector, :clues => clues)
  #   scraper.to_xlsx('my_spreadsheet.xlsx')
  class Scraper

    # Creates a new Scraper
    #
    # @param [String] url The resource to scrape
    #
    # @param [String] selector The xpath selector to use when searching for
    #   clues. Use %s in the selector to interpolate each clue
    #
    # @param [Hash] options each key is checked with respond_to? for a
    #   matching public writer ("key="). If a writer exists, the value is
    #   passed to it; unknown keys are silently ignored.
    #
    # @option options [Array] :clues The clues to search for
    #
    # @option options [Hash] :hooks A hash of hooks where the key is the clue
    #   name the callable value will be called against.
    #
    # @raise [ArgumentError] if :clues is not an Array or :hooks is not a Hash
    def initialize(url, selector, options={})
      self.url = url
      self.selector = selector
      options.each do |name, value|
        writer = "#{name}="
        send(writer, value) if respond_to?(writer)
      end
    end

    # The url we will scrape from
    # @return [String]
    attr_reader :url

    # Sets the url to scrape from.
    # The memoized document is dropped so the next scrape loads the new
    # resource instead of reusing the one fetched for the previous url.
    #
    # @param [String] value The resource to scrape
    def url=(value)
      @doc = nil
      @url = value
    end

    # The xpath selector to use when searching for clues
    # @return [String]
    attr_accessor :selector

    # The output from scraping as an array of [label, datum] rows.
    # This is populated by the scrape or to_xlsx methods
    #
    # @return [Array]
    def output
      @output ||= []
    end

    # A hash of callables to call when parsing each item found by the selector
    # and clue combination. The element found will be passed to the member of
    # this hash that uses the clue as a key
    #
    # @see example/example.rb
    #
    # @return [Hash]
    def hooks
      @hooks ||= {}
    end

    # @see hooks
    #
    # @param [Hash] hash the hooks, keyed by clue
    #
    # @raise [ArgumentError] if hash is not a Hash
    def hooks=(hash)
      raise ArgumentError, 'Hooks must be a hash of procs to call when scraping each clue' unless hash.is_a?(Hash)
      @hooks = hash
    end

    # Adds a hook to the hook hash
    #
    # @param [String] clue the clue this hook will be called for
    #
    # @param [Proc] p_roc the Proc to call when the clue is found
    def add_hook(clue, p_roc)
      hooks[clue] = p_roc
    end

    # Scrapes the resource using the clues and hooks provided.
    # Any previously built package and output are discarded first. For each
    # clue that matches, the first datum is placed on the row labelled with
    # the clue and the remaining data follow, in order, on rows with an empty
    # label. Clues that match nothing produce no rows.
    #
    # @return [Array]
    def scrape
      @package = nil
      @output = []
      clues.each do |clue|
        detail = parse_clue(clue)
        next unless detail
        # Destructure instead of popping: keeps the data in document order
        # and leaves an array returned by a hook untouched.
        first, *rest = detail
        output << [clue, first]
        rest.each { |datum| output << ['', datum] }
      end
      output
    end

    # Serializes the output to xlsx. If you do not specify the filename
    # parameter the package will be created, but not serialized to disk. This
    # means you can use the return value to stream the data using
    # to_xlsx(false).to_stream.read
    #
    # @param [String] filename the filename to use in output
    #
    # @return [Axlsx::Package]
    def to_xlsx(filename=false)
      scrape
      serialize(filename)
    end

    # The clues we are going to look for with the selector in the document returned by url
    #
    # @return [Array]
    def clues
      @clues ||= []
    end

    # Sets the clues for the scraper
    #
    # @param [Array] value The clues to look for.
    #
    # @raise [ArgumentError] if value is not an Array
    def clues=(value)
      raise ArgumentError, 'clues must be an array of strings to search for with your selector' unless value.is_a?(Array)
      @clues = value
    end

    # The axlsx package used for xlsx serialization
    #
    # @return [Axlsx::Package]
    def package
      @package ||= Axlsx::Package.new
    end

    private

    # The parsed document for the current url, loaded on first use and
    # memoized. Any failure while opening or parsing the resource is reported
    # as an ArgumentError.
    def doc
      @doc ||= begin
        read_resource { |io| Nokogiri::HTML(io) }
      rescue
        raise ArgumentError, 'Invalid URL - Nothing to parse'
      end
    end

    # Opens the resource at @url, yields the IO to the block and closes it
    # afterwards, returning the block's value.
    # URI.open is preferred where it is public, because Kernel#open no longer
    # opens URLs on current Rubies; older Rubies fall back to Kernel#open as
    # patched by open-uri.
    def read_resource(&block)
      if URI.respond_to?(:open)
        URI.open(@url, &block)
      else
        open(@url, &block)
      end
    end

    # Interpolates the clue into the selector's %s placeholder.
    def selector_for_clue(clue)
      @selector % clue
    end

    # Finds the first element matching the clue and turns it into an array of
    # data, using the clue's hook when one is registered and otherwise
    # splitting the inner HTML on <br> and stripping each piece.
    # Returns nil when nothing matches the clue.
    def parse_clue(clue)
      element = doc.at(selector_for_clue(clue))
      return unless element
      call_hook(clue, element) || element.inner_html.split('<br>').map(&:strip)
    end

    # Calls the hook registered for the clue, if any, and wraps a non-array
    # result in an array. Returns nil when no callable hook is registered.
    def call_hook(clue, element)
      hook = hooks[clue]
      return unless hook.respond_to?(:call)
      value = hook.call(element)
      value.is_a?(Array) ? value : [value]
    end

    # Adds a worksheet holding one row per output entry to the package and
    # writes it to file_name when one is given. Returns the package.
    def serialize(file_name)
      package.workbook.add_worksheet do |sheet|
        output.each { |datum| sheet.add_row datum }
      end
      package.serialize(file_name) if file_name
      package
    end
  end
end
@@ -0,0 +1,3 @@
1
module Ninja2k
  # Version string of the ninja2k gem.
  VERSION = '0.0.1'
end
data/lib/ninja2k.rb ADDED
@@ -0,0 +1,2 @@
1
+ require "ninja2k/version"
2
+ require "ninja2k/scraper.rb"
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ninja2k
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Randy Morgan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-08-03 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: axlsx
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: kramdown
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: A web resource scraper that lets you define xpath selectors, clues and
79
+ hooks for custom parsing as well as export to xlsx.
80
+ email:
81
+ - digital.ipseity@gmail.com
82
+ executables: []
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - lib/ninja2k/scraper.rb
87
+ - lib/ninja2k/version.rb
88
+ - lib/ninja2k.rb
89
+ - README.md
90
+ - LICENSE
91
+ - Rakefile
92
+ homepage: https://github.com/randym/ninja2k
93
+ licenses: []
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ requirements: []
111
+ rubyforge_project:
112
+ rubygems_version: 1.8.24
113
+ signing_key:
114
+ specification_version: 3
115
+ summary: ninja2k is a gem for ninja2k
116
+ test_files: []
117
+ has_rdoc: