web_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5460b12d5b804540dbe145c157897d92eeaa68ea
4
+ data.tar.gz: 79d9afacf57e5424b808cc90fc7e1fafa9549170
5
+ SHA512:
6
+ metadata.gz: cabaeb1fe86d0e180642245630176bbcec94a797cf7a53fe036baf89396dfbc701db93f522fb793d2796902dc1ff676fc7a7f26a0a689a5f61499f55d5531a86
7
+ data.tar.gz: 1f8209b5ab0164d5759dab4bd70dac5ccb51af5d63906c506acd386f50ed3d3fc22457f178dc0110df28fd9a669bd37f3a888f335af093386428899bc4bb3709
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in web_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Daniel Mrozek
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,116 @@
1
+ # WebParser
2
+
3
+ Simple gem for easy information fetching from web pages.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'web_parser'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install web_parser
18
+
19
+ ## Example usage
20
+
21
+ Just write your own class and include WebParser::Doc.
22
+
23
+ ```ruby
24
+ class YahooSearch
25
+ # include WebParser
26
+ include WebParser::Doc
27
+
28
+ # define recipes
29
+ recipes do
30
+ # simplest way to define recipe
31
+ query :xpath, '//input[@id="yschsp"]/@value'
32
+ # You can use simple normalization as last parameter
33
+ query_downcase :xpath, '//input[@id="yschsp"]/@value',
34
+ ->(value) { value.text.downcase }
35
+ # You can also provide just method name for normalization
36
+ page_number :css, '#pg > strong', :to_i
37
+ # Or you can do whatever you want to obtain value, just provide lambda as
38
+ # parameter
39
+ first_page? :lambda, ->(doc) {
40
+ doc.css('#pg > strong').text.to_i == 1
41
+ }
42
+ # Nesting
43
+ right_links do
44
+ sign_in :css, '#yucs-profile', :strip
45
+ mail :css, '#yucs-mail_link_id', :strip
46
+ end
47
+ # Array, useful for example when parsing e-shops
48
+ results :css, '#web > ol > li' do
49
+ name :css, '> .res h3'
50
+ url :xpath, './/h3[1]/a/@href'
51
+ end
52
+ end
53
+ end
54
+ ```
55
+
56
+ Then just initialize your class and call `parse`.
57
+
58
+ ```ruby
59
+ require 'open-uri'
60
+
61
+ html_page = open('http://search.yahoo.com/search?p=Ruby').read
62
+
63
+ YahooSearch.new(html_page).parse
64
+ => {:query=>"Ruby",
65
+ :query_downcase=>"ruby",
66
+ :page_number=>1,
67
+ :first_page?=>true,
68
+ :right_links=>{:sign_in=>"Sign In", :mail=>"Mail"},
69
+ :results=>
70
+ [
71
+ {
72
+ :name=>"Ruby Programming Language",
73
+ :url=>"https://www.ruby-lang.org/en/" },
74
+ {
75
+ :name=>"Ruby - Wikipedia, the free encyclopedia",
76
+ :url=>"http://en.wikipedia.org/wiki/Ruby"},
77
+ {
78
+ :name=>"Ruby - Image Results",
79
+ :url=>"http://images.search.yahoo.com/search/images?_adv_prop=image&va=Ruby"},
80
+ {
81
+ :name=>"Ruby (programming language) - Wikipedia, the free
82
+ encyclopedia",
83
+ :url=>"http://en.wikipedia.org/wiki/Ruby_(programming_language)"},
84
+ {
85
+ :name=>"‘Ruby’ Today: Reality Star Dishes on Show’s Failure ...",
86
+ :url=>"http://abcnews.go.com/blogs/entertainment/2013/01/ruby-today-reality-star-dishes-on-shows-failure/"},
87
+ {
88
+ :name=>"Download Ruby",
89
+ :url=>"https://www.ruby-lang.org/en/downloads/"},
90
+ {
91
+ :name=>"Ruby: The gemstone Ruby information and pictures",
92
+ :url=>"http://www.minerals.net/gemstone/ruby_gemstone.aspx"},
93
+ {
94
+ :name=>"Ruby - Gemstone",
95
+ :url=>"http://www.gemstone.org/index.php?option=com_content&view=article&id=85:ruby&catid=1:gem-by-gem&Itemid=14"},
96
+ {
97
+ :name=>"Buy Loose Precious Ruby Gemstones at Wholesale Prices from ...",
98
+ :url=>"http://www.gemselect.com/ruby/ruby.php"},
99
+ {
100
+ :name=>"Ruby on Rails",
101
+ :url=>"http://rubyonrails.org/"},
102
+ {
103
+ :name=>"Ruby (Adventures) - Bulbapedia, the community-driven Pokémon ...",
104
+ :url=>"http://bulbapedia.bulbagarden.net/wiki/Ruby_(Adventures)"}
105
+ ]
106
+ }
107
+
108
+ ```
109
+
110
+ ## Contributing
111
+
112
+ 1. Fork it
113
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
114
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
115
+ 4. Push to the branch (`git push origin my-new-feature`)
116
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
# encoding: utf-8

# Development tasks for the web_parser gem.

require 'rubygems'
require 'bundler'

# Resolve the library relative to this Rakefile rather than the current
# working directory.
require_relative 'lib/web_parser'

desc 'Start a Pry session scoped to the WebParser module'
task :console do
  # Pry is only needed for this task, so it is loaded lazily.
  require 'pry'
  # Drop the command-line arguments before starting Pry — presumably so Pry
  # does not try to interpret rake's own arguments (TODO confirm).
  ARGV.clear
  Pry.start WebParser
end

desc 'Alias for the console task'
task :c => :console
14
+
@@ -0,0 +1,43 @@
1
# Example: fetch a Yahoo search result page and extract structured data
# from it with WebParser.

require 'open-uri'
require 'pp'
$LOAD_PATH.unshift "#{File.dirname(__FILE__)}/../lib"
require 'web_parser'

class YahooSearch
  # include WebParser
  include WebParser::Doc

  # define recipes
  recipes do
    # simplest way to define recipe
    query :xpath, '//input[@id="yschsp"]/@value'
    # You can use simple normalization as last parameter
    query_downcase :xpath, '//input[@id="yschsp"]/@value',
                   ->(value) { value.text.downcase }
    # You can also provide just method name for normalization
    page_number :css, '#pg > strong', :to_i
    # Or you can do whatever you want to obtain value, just provide lambda as
    # parameter
    first_page? :lambda, ->(doc) {
      doc.css('#pg > strong').text.to_i == 1
    }
    # Nesting
    right_links do
      sign_in :css, '#yucs-profile', :strip
      mail :css, '#yucs-mail_link_id', :strip
    end
    # Array, useful for example when parsing e-shops
    results :css, '#web > ol > li' do
      name :css, '> .res h3'
      url :xpath, './/h3[1]/a/@href'
    end
  end
end

# Kernel#open no longer opens URLs on Ruby 3.0+; reading through the URI
# object (provided by open-uri) works on old and new Rubies alike and never
# treats the string as a file path or shell command.
html_page = URI.parse('http://search.yahoo.com/search?p=Ruby').read

rslt = YahooSearch.new(html_page).parse

pp rslt
43
+
@@ -0,0 +1,58 @@
1
module WebParser
  # Mixin that turns a class into a page parser. Including it adds a
  # class-level +recipes+ DSL (see ClassMethods) and an instance-level
  # +parse+ that applies those recipes to a parsed document.
  module Doc
    # Raised by #initialize when the parser class has no recipes defined.
    class XPathsNotSet < StandardError; end

    # Hook run on +include+: adds the class-level DSL to the including class.
    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      # Defines (when a block is given) or returns the recipes of this class.
      # A class without recipes of its own falls back to its superclass, so
      # subclasses of a parser inherit the parent's recipes.
      #
      # Callable normalizers receive the raw result of the css/xpath query,
      # so they call +text+ themselves; a recipe that takes a block (one
      # entry per matched node) must not use a normalizer that collapses
      # the matches into a single value.
      #
      # @example
      #   recipes do
      #     id          :css, '.title > h1'
      #     description :css, '.desc > p', ->(value) { value.text.gsub('-', '') }
      #     price       :css, '.price > .vat', :to_f
      #     summary     :lambda, ->(doc) {
      #       doc.css('a#total_downloads').text.gsub(',', '').to_f
      #     }
      #     additional_info do
      #       vat :css, '.vat', :to_i
      #       fee :css, '.fee', :to_f
      #     end
      #     categories :css, '.categories > .category' do
      #       name  :xpath, './li/a'
      #       url   :xpath, './li/a/@href'
      #       count :lambda, ->(doc) {
      #         doc.xpath('./li/a').text =~ /(\d+)$/ && $1
      #       }
      #     end
      #   end
      #
      # @return [Recipes, nil] the recipes in effect, or nil when none are set
      def recipes(&block)
        @recipes = Recipes.new(&block) if block
        return @recipes if @recipes

        # Class-level instance variables are not inherited, so look upwards.
        parent = respond_to?(:superclass) ? superclass : nil
        parent.recipes if parent.respond_to?(:recipes)
      end
    end

    # Creates a new page parser.
    # @param doc [String] page source; handed as-is to +parser.parse+
    # @param parser [#parse] object used to parse +doc+ (Nokogiri::HTML by
    #   default)
    # @raise [XPathsNotSet] when the class has no recipes defined
    def initialize(doc, parser = Nokogiri::HTML)
      # Fail before doing any parsing work if there is nothing to extract.
      raise XPathsNotSet, "no recipes defined!" unless self.class.recipes
      @doc = parser.parse(doc)
    end

    # Main method for parsing the document.
    # @return [Hash] values extracted from the page, keyed by recipe name
    def parse
      get_parsed
    end

    private

    # Applies the class's recipes to the parsed document.
    def get_parsed
      self.class.recipes.apply(@doc)
    end
  end

end
58
+
@@ -0,0 +1,32 @@
1
module WebParser
  # A single named extraction rule. Depending on its type it returns a
  # constant (:val), queries the document with a CSS selector (:css) or an
  # XPath expression (:xpath), or hands the document to a callable (:lambda).
  class Recipe
    attr_reader :type, :value, :normalize_method

    # @param name [Symbol] name of the recipe (stored only)
    # @param args [Array] either +[value]+ for a constant recipe, or
    #   +[type, value, normalize_method]+ where +normalize_method+ is optional
    def initialize(name, *args)
      @name = name
      if args.size == 1
        # A lone argument is a constant value; tag it as :val so #apply
        # returns it instead of rejecting a nil type.
        @type = :val
        @value = args.first
        @normalize_method = nil
      else
        @type, @value, @normalize_method = args
      end
    end

    # Evaluates the recipe against +doc+.
    # @param doc [#css, #xpath] document or node to query (passed unchanged
    #   to the callable for :lambda recipes, ignored for :val recipes)
    # @return [Object] the constant, the normalized query result, or the
    #   callable's return value
    # @raise [RuntimeError] when the recipe type is not one of the above
    def apply(doc)
      case @type
      when :val    then @value
      when :css    then normalize(doc.css(@value))
      when :xpath  then normalize(doc.xpath(@value))
      when :lambda then @value.call(doc)
      else raise "unknown recipe type '#{@type}'!"
      end
    end

    private

    # Post-processes a query result. A callable normalizer receives the raw
    # result; otherwise the result's text is taken, non-breaking spaces are
    # replaced with plain ones, and the optional method name is sent to it.
    def normalize(value)
      return @normalize_method.call(value) if @normalize_method.respond_to?(:call)

      text = value.text.gsub("\u00a0", ' ') # replace nbsp with normal space
      @normalize_method ? text.send(@normalize_method) : text
    end
  end
end
32
+
@@ -0,0 +1,50 @@
1
module WebParser
  # A named collection of recipes, built with a small DSL: inside the block
  # given to +new+, any otherwise-undefined method call defines an entry
  # (see #method_missing).
  #
  # Because the DSL relies on method_missing, a name that is already a
  # callable method in the block's context (for example the readers
  # +recipes+, +type+ and +recipe+ below) is dispatched to that method and
  # cannot be used as an entry name.
  class Recipes
    attr_reader :recipes, :type, :recipe

    # @param type [Symbol] :single for a plain group, :each for a group that
    #   is applied to every element produced by +recipe+
    # @param recipe [Recipe, nil] recipe producing the elements for :each
    # @raise [RuntimeError] when +type+ is :each and no recipe is given
    def initialize(type = :single, recipe = nil, &block)
      raise "recipe must be set for type :each" if type == :each && !recipe
      @recipes = {}
      @type = type
      @recipe = recipe
      instance_eval(&block) if block
    end

    # Applies every entry to +doc+.
    # @return [Hash] entry name => extracted value (a nested Hash for a
    #   group, an Array of Hashes for an :each group)
    def apply(doc)
      recipes.each_with_object({}) do |(name, rule), result|
        case rule
        when Recipe  then result[name] = apply_recipe(doc, rule)
        when Recipes then result[name] = apply_recipes(doc, rule)
        end
      end
    end

    # DSL entry point. Supported shapes:
    #   name do ... end                    - nested group
    #   name type, value[, normalizer] do  - group applied to each element
    #   name type, value[, normalizer]     - single recipe
    # Anything else falls through to the default NoMethodError.
    def method_missing(name, *args, &block)
      selector_call = args.size == 2 || args.size == 3

      if block && args.empty?
        @recipes[name] = Recipes.new(&block)
      elsif block && selector_call
        args[2] ||= ->(val) { val } # just return array of elements
        @recipes[name] = Recipes.new(:each, Recipe.new(name, *args), &block)
      elsif selector_call
        @recipes[name] = Recipe.new(name, *args)
      else
        super
      end
    end

    private

    # Evaluates a single recipe against +doc+.
    def apply_recipe(doc, recipe)
      recipe.apply(doc)
    end

    # Evaluates a nested group: once per element for :each groups, once
    # against +doc+ otherwise.
    def apply_recipes(doc, recipes)
      if recipes.type == :each
        recipes.recipe.apply(doc).map { |subdoc| recipes.apply(subdoc) }
      else
        recipes.apply(doc)
      end
    end
  end
end
50
+
@@ -0,0 +1,3 @@
1
module WebParser
  # Version of the gem; frozen so the shared constant cannot be mutated in
  # place by accident.
  VERSION = "0.0.1".freeze
end
data/lib/web_parser.rb ADDED
@@ -0,0 +1,11 @@
1
# Entry point of the gem: loads the version constant, Nokogiri and the
# library's own components.

# Loaded relative to this file, like the other components below, so the
# library also loads when its lib directory is not on $LOAD_PATH.
require_relative 'web_parser/version'

require 'nokogiri'

require_relative 'web_parser/recipe'
require_relative 'web_parser/recipes'
require_relative 'web_parser/doc'

# Top-level namespace of the gem.
module WebParser
end
11
+
@@ -0,0 +1,26 @@
1
# coding: utf-8
# Gem specification for web_parser.

# Make lib/ loadable so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'web_parser/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name     = 'web_parser'
  gem.version  = WebParser::VERSION
  gem.authors  = ['Daniel Mrozek']
  gem.email    = ['mrazicz@gmail.com']
  gem.homepage = 'https://github.com/mrazicz/web_parser'
  gem.license  = 'MIT'

  # Descriptions
  gem.description = 'Simple gem for easy web page parsing.'
  gem.summary     = 'Simple gem for easy web page parsing. Just set xpaths, provide HTML, and get hash with informations.'

  # Packaged files: everything tracked by git, with executables and test
  # files picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependency
  gem.add_dependency 'nokogiri', '~> 1.6.3.1'

  # Development dependencies
  gem.add_development_dependency 'bundler', '~> 1.3'
  gem.add_development_dependency 'pry'
  gem.add_development_dependency 'rake'
end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Daniel Mrozek
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-08-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 1.6.3.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.6.3.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Simple gem for easy web page parsing.
70
+ email:
71
+ - mrazicz@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - .gitignore
77
+ - Gemfile
78
+ - LICENSE.txt
79
+ - README.md
80
+ - Rakefile
81
+ - examples/yahoo_search.rb
82
+ - lib/web_parser.rb
83
+ - lib/web_parser/doc.rb
84
+ - lib/web_parser/recipe.rb
85
+ - lib/web_parser/recipes.rb
86
+ - lib/web_parser/version.rb
87
+ - web_parser.gemspec
88
+ homepage: https://github.com/mrazicz/web_parser
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - '>='
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - '>='
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 2.1.4
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: Simple gem for easy web page parsing. Just set xpaths, provide HTML, and
112
+ get hash with informations.
113
+ test_files: []
114
+ has_rdoc: