html_mapper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5390c860b698088d4b03d64fe99bcce721db9c0a
4
+ data.tar.gz: 15a7d0754a49f32cc1d43fa39e92b0ec02f34cc7
5
+ SHA512:
6
+ metadata.gz: 12929e4a99e7da410537d9eceead54070b2930b993a19de71859bf547643ddeba9fcf36186e4270a294789db80ea1249f61b441137c85be4310fdf2a8613c72b
7
+ data.tar.gz: 8951384b53115fdaffeb6d737902b1d241c61c213d60aa6c26227889a3115cde9b0d8c7f16e005578f0275376eb057b7368a14dba9693ba0ae7c89574a431726
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ /examples/*.html
11
+ /examples/ignore/
12
+ /extra/
13
+ *.swp
14
+ t.rb
15
+ /tasks/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1 @@
1
+ 2.2.0
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.5
@@ -0,0 +1,11 @@
1
+ --title 'HTML Mapper'
2
+ --output-dir api-docs
3
+ --template-path doc-src/templates
4
+ --markup markdown
5
+ --markup-provider rdiscount
6
+ --hide-api private
7
+ --plugin sitemap
8
+ --asset doc-src/images:images
9
+ -e doc-src/plugins/apis.rb
10
+ -e doc-src/plugins/resources.rb
11
+ lib/**/*.rb
@@ -0,0 +1,13 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
4
+
5
+ We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, age, or religion.
6
+
7
+ Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct.
8
+
9
+ Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team.
10
+
11
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers.
12
+
13
+ This Code of Conduct is adapted from the [Contributor Covenant](http://contributor-covenant.org), version 1.0.0, available at [http://contributor-covenant.org/version/1/0/0/](http://contributor-covenant.org/version/1/0/0/)
data/Gemfile ADDED
@@ -0,0 +1,26 @@
1
source 'https://rubygems.org'

# Specify your gem's dependencies in html_mapper.gemspec
gemspec

# Gems needed only to build the API docs and the guide.
group :docs do
  gem 'yard', git: 'https://github.com/trevorrowe/yard.git', branch: 'frameless'
  gem 'yard-sitemap', '~> 1.0'
  gem 'rdiscount'

  gem 'nanoc' # guide

  # guide - syntax highlight
  gem 'nokogiri'
  gem 'coderay'

  # guide - local preview
  gem 'adsf' # a dead simple fileserver
  gem 'guard-nanoc'
end

# Interactive console helpers.
group :repl do
  gem 'pry'
end

#require './debug.rb'
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Jiren Patel[jirenpatel@gmail.com]
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,39 @@
1
+ # HtmlMapper
2
+
3
+ HtmlMapper parses HTML and maps it to Ruby objects or hashes. The library code lives in `lib/html_mapper.rb`. To experiment with it, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Expand this description of the gem.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'html_mapper'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install html_mapper
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ 1. Fork it ( https://github.com/jiren/html_mapper/fork )
36
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
37
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
38
+ 4. Push to the branch (`git push origin my-new-feature`)
39
+ 5. Create a new Pull Request
@@ -0,0 +1,8 @@
1
require 'bundler/gem_tasks'
require 'html_mapper'

# Load every *.rake file found at any depth below the current directory.
Dir.glob('**/*.rake').each { |task_file| load task_file }
8
+
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "html_mapper"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,30 @@
1
# coding: utf-8
# Gem specification for html_mapper.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'html_mapper/version'

Gem::Specification.new do |spec|
  spec.name          = 'html_mapper'
  spec.version       = HtmlMapper::VERSION
  spec.authors       = ['Jiren']
  spec.email         = ['jirenpatel@gmail.com']

  # To prevent pushes to rubygems.org, set
  # spec.metadata['allowed_push_host'] to your own gem server
  # (guard it with `spec.respond_to?(:metadata)` on old RubyGems).

  spec.summary       = %q{HTML to ruby object or hash}
  spec.description   = %q{Parse html and map to ruby object or hash}
  spec.homepage      = 'https://github.com/jiren/html_mapper'
  spec.license       = 'MIT'

  # Ship everything tracked by git except tests, examples and task files.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features|examples|tasks)/}) }
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # '~> 1.10.4' only admitted bundler 1.10.x; '~> 1.10' still requires at
  # least 1.10 but accepts later 1.x releases as well.
  spec.add_development_dependency 'bundler', '~> 1.10'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec'
  spec.add_dependency 'nokogiri', '>= 1.5.5'
  spec.add_dependency 'rest-client'
end
@@ -0,0 +1,47 @@
1
+ require 'rails/generators/named_base'
2
+
3
module HtmlMapper
  module Generators # :nodoc:
    # Generator that writes a scraper class into app/scrapers and a matching
    # spec (when RSpec is defined) or test file.
    class ScraperGenerator < ::Rails::Generators::NamedBase # :nodoc:
      desc 'This generator creates a HtmlMapper Scraper in app/scrapers and a corresponding test'

      check_class_collision suffix: 'Scraper'

      # @return [String] directory that contains this generator file
      def self.default_generator_root
        File.dirname(__FILE__)
      end

      # Renders scraper.rb.erb to app/scrapers/<class_path>/<file_name>_scraper.rb.
      def create_scraper_file
        destination = File.join('app/scrapers', class_path, "#{file_name}_scraper.rb")
        template 'scraper.rb.erb', destination
      end

      # Writes an RSpec spec when RSpec is defined, a test file otherwise.
      def create_test_file
        return create_scraper_spec if defined?(RSpec)

        create_scraper_test
      end

      private

      # Renders scraper_spec.rb.erb below spec/scrapers.
      def create_scraper_spec
        spec_path = File.join('spec/scrapers', class_path, "#{file_name}_scraper_spec.rb")
        template 'scraper_spec.rb.erb', spec_path
      end

      # Renders scraper_test.rb.erb below test/scrapers.
      def create_scraper_test
        test_path = File.join('test/scrapers', class_path, "#{file_name}_scraper_test.rb")
        template 'scraper_test.rb.erb', test_path
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ <% module_namespacing do -%>
2
+ class <%= class_name %>Scraper
3
+ include HtmlMapper
4
+
5
+ # collection :test, '.test' do
6
+ # field :name, '.name'
7
+ # end
8
+
9
+ end
10
+ <% end -%>
@@ -0,0 +1,6 @@
1
+ require 'rails_helper'
2
+ <% module_namespacing do -%>
3
+ RSpec.describe <%= class_name %>Scraper, :type => :scraper do
4
+ pending "add some examples to (or delete) #{__FILE__}"
5
+ end
6
+ <% end -%>
@@ -0,0 +1,8 @@
1
<%# The generator writes this file to test/scrapers/<class_path>/, so a -%>
<%# relative require would look for test_helper in that directory. -%>
require 'test_helper'
<% module_namespacing do -%>
class <%= class_name %>ScraperTest < MiniTest::Unit::TestCase
  def test_example
    skip "add some examples to (or delete) #{__FILE__}"
  end
end
<% end -%>
@@ -0,0 +1,172 @@
1
+ require 'nokogiri'
2
+ require 'rest_client'
3
+ require 'date'
4
+ require 'time'
5
+ require 'json'
6
+
7
+ require 'html_mapper/version'
8
+ require 'html_mapper/parsers'
9
+ require 'html_mapper/supported_types'
10
+ require 'html_mapper/collection'
11
+ require 'html_mapper/relation'
12
+ require 'html_mapper/field'
13
+ require 'html_mapper/object_helper'
14
+ require 'html_mapper/result'
15
+ require 'html_mapper/mapper_exporter'
16
+
17
# Maps HTML documents onto Ruby objects through a small class-level DSL.
#
# Including this module in a class extends it with ClassMethods and mixes in
# InstanceMethods and ObjectHelper.
module HtmlMapper
  # Raised by HtmlMapper.parse when no parser is registered for a url.
  class NotFoundError < StandardError; end

  # Hook run on +include+: prepares the collection registry of +base+ and
  # wires in the DSL and instance helpers.
  def self.included(base)
    base.instance_eval do
      @collections = {}
    end

    base.extend ClassMethods
    base.send :include, InstanceMethods
    base.send :include, ObjectHelper
  end

  # Functions available directly on the HtmlMapper module.
  module ModuleMethods
    # Selects the parsers registered for the given url and parses the page
    # with each of them.
    #
    #   HtmlMapper.parse('http://www.imdb.com/search/title?count=100', html)
    #
    # @param [String] url
    # @param [String] html HTML content of the given url
    # @return [Array] one parsed object per matching parser class
    # @raise [NotFoundError] when no parser is registered for the url
    def parse(url, html)
      parsers = Parsers.get(url)
      fail NotFoundError, "No parser found for #{url}" unless parsers

      parsers.map { |klass| klass.parse(Nokogiri::HTML.parse(html), url) }
    end

    #
    # @param [RestClient] http_client
    #   Set another http client (e.g. httparty); it must respond to +get(url)+.
    #
    attr_writer :http_client

    #
    # @return [RestClient] the configured client, RestClient by default
    #
    def http_client
      @http_client || RestClient
    end

    # Downloads the page with the http client and parses it.
    #
    # @param [String] url
    def get(url)
      parse(url, http_client.get(url))
    end

    # Builds a scraper class from a JSON mapper definition.
    #
    # @param [String] mapper_json
    def to_mapper(mapper_json)
      MapperExporter.to_mapper(JSON.parse(mapper_json, symbolize_names: true))
    end
  end

  extend ModuleMethods

  # DSL and entry points added to every class that includes HtmlMapper.
  module ClassMethods
    attr_reader :default_collection

    # Named collections of this class, keyed by symbol. Created on demand so
    # that subclasses of a scraper, which never trigger +included+, still get
    # a registry of their own instead of nil.
    def collections
      @collections ||= {}
    end

    # Registers this class as a parser for each given domain (String or Regexp).
    def domains(*args)
      args.each { |domain| Parsers.add(self, domain) }
      @domains = args
    end

    # @return [Array, nil] the domains passed to the last +domains+ call
    def domain_list
      @domains
    end

    # Declares a named collection; +field+, +has_many+ and +has_one+ calls
    # made inside the block are attached to it.
    #
    # @return [nil]
    def collection(name, selector, options = {})
      name = name.to_sym
      @current_collection = collections[name] = Collection.new(name, selector, options)

      yield if block_given?
      nil
    ensure
      # Reset even when the block raises, so later declarations are not
      # attached to this collection by accident.
      @current_collection = nil
    end

    # Declares a field on the current (or the default) collection.
    def field(name, selector, options = {})
      current_collection.new_field(name, selector, options)
    end

    # Declares a relation flagged +many: true+. The caller's options hash is
    # left untouched.
    def has_many(name, klass, options = {})
      current_collection.new_relation(name, klass, options.merge(many: true))
    end

    # Declares a relation flagged +many: false+. The caller's options hash is
    # left untouched.
    def has_one(name, klass, options = {})
      current_collection.new_relation(name, klass, options.merge(many: false))
    end

    # Builds a new instance and fills it from the document.
    #
    # @param [String, Object] doc HTML string (parsed with Nokogiri) or an
    #   already parsed document
    # @param [String, nil] url stored on the instance as +crawl_url+
    # @return [Object] the populated instance
    def parse(doc, url = nil)
      doc = Nokogiri::HTML.parse(doc) if doc.is_a?(String)
      obj = new
      obj.crawl_url = url

      collections.each do |name, collection|
        obj[name] = collection.process(doc, obj)
      end

      if @default_collection
        obj[@default_collection.name] = @default_collection.process(doc, obj)
      end

      # Callbacks registered through after_process run once everything is set.
      (@callbacks || []).each { |callback| obj.send(callback) }

      obj
    end

    # Parses the page at +url+; the page is downloaded unless +html+ is given.
    def get(url, html = nil)
      html ||= HtmlMapper.http_client.get(url)
      parse(Nokogiri::HTML.parse(html), url)
    end

    # Registers instance methods to be called on each parsed object.
    def after_process(*args)
      @callbacks ||= []
      args.each { |callback| @callbacks << callback.to_sym }
    end

    # @return [Hash] mapper definition of this class; arguments are accepted
    #   and ignored so JSON serializers that pass options do not fail
    def as_json(*_args)
      MapperExporter.export(self)
    end

    # @return [String] mapper definition of this class as JSON
    def to_json(*args)
      as_json.to_json(*args)
    end

    #
    # Writes "<class>.mapper" (mapper definition) and "<class>.data" (parsed
    # page) into +dir+.
    #
    # @param [String] dir
    #   Output directory name
    # @param [String] url
    #   Web page url
    # @param [String] html
    #   Optional
    #
    def export_mapper_with_data(dir, url, html = nil)
      file = File.join(dir, to_s)

      File.write("#{file}.mapper", JSON.pretty_generate(as_json))
      data = get(url, html)
      File.write("#{file}.data", data.to_json)
    end

    private

    # Collection currently being declared, or the lazily created default one.
    def current_collection
      @current_collection ||
        (@default_collection ||= Collection.new(:_default, '.', {}))
    end
  end

  # Instance-side state of a scraper object.
  module InstanceMethods
    # @return [String, nil] Url
    attr_accessor :crawl_url

    def initialize
      @values = {}
    end
  end
end
@@ -0,0 +1,71 @@
1
module HtmlMapper
  # A named set of fields and relations that is evaluated against every node
  # matching +selector+.
  class Collection
    attr_accessor :selector, :name, :fields, :relations, :options

    # @param [#to_sym] name
    # @param [String] selector
    # @param [Hash, nil] options recognised keys used here: :single, :reject_if
    def initialize(name, selector, options)
      @selector = selector
      @name = name.to_sym
      @fields = []
      @relations = []
      @options = options || {}
    end

    # Searches +doc+ with the selector, drops rejected nodes and maps the rest.
    #
    # @return [Result, nil, Array<Result>] a single result (or nil) when
    #   options[:single] is set, otherwise one result per node
    def process(doc, obj)
      eles = doc.search(selector).reject { |ele| exec_reject_if(ele, obj) }
      return find(eles.first, obj) if options[:single]

      eles.map { |ele| find(ele, obj) }
    end

    # Builds the Result for one node by evaluating every field and relation.
    #
    # @return [Result, nil] nil when +doc+ is nil
    def find(doc, obj)
      return nil if doc.nil?

      result = Result.new(name)
      result.parent = obj

      @fields.each { |field| result[field.name] = field.find(doc, obj) }
      @relations.each { |relation| relation.parse(doc, result) }

      result
    end

    # Creates a Field, adds it to this collection and returns it.
    def new_field(name, selector, options)
      Field.new(name, selector, options).tap { |field| @fields << field }
    end

    # Creates a Relation and appends it to this collection's relations.
    def new_relation(name, klass, options)
      @relations << Relation.new(name, klass, options)
    end

    # Evaluates options[:reject_if] for a node: a Symbol is sent to +obj+
    # with the node, anything else is called with the node.
    #
    # @return [Object] false when no :reject_if option is configured
    def exec_reject_if(ele, obj)
      condition = options[:reject_if]
      return false if condition.nil?

      condition.is_a?(Symbol) ? obj.send(condition, ele) : condition.call(ele)
    end

    # @return [Hash] serializable description of this collection
    def as_json
      {
        name: name,
        selector: selector,
        options: exportable_options,
        fields: fields.map(&:as_json),
        relations: relations.map(&:as_json)
      }
    end

    private

    # Options without a Proc :reject_if, which cannot be serialized; this
    # mirrors how Field#as_json drops a Proc :eval. Symbol values are kept.
    def exportable_options
      options.reject { |key, value| key == :reject_if && value.is_a?(Proc) }
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'csv'
2
+
3
module HtmlMapper
  # Mixin that records which fields a CSV export should use.
  module DataExporter
    # Stores the given field names and returns them as an array.
    def csv_fields(*names)
      @csv_fields = names
    end

    # Placeholder: has no body yet and therefore returns nil.
    def to_csv; end
  end
end
@@ -0,0 +1,63 @@
1
module HtmlMapper
  # A single value extracted from the node(s) matching +selector+.
  #
  # Options read by this class:
  #   :all       - return the values of every matching node, not only the first
  #   :attribute - read this attribute instead of the node content
  #   :as        - type key looked up in SupportedTypes.types for typecasting
  #   :eval      - Symbol (method sent to the scraper object) or callable used
  #                to post-process the stripped value
  class Field
    attr_accessor :name, :type, :selector, :options

    def initialize(name, selector, options = {})
      @name = name
      @selector = selector
      @options = options
    end

    # Extracts the field value from +doc+.
    #
    # @return [Object, nil, Array] array of non-nil values when options[:all]
    #   is set, otherwise the value of the first match (nil when none)
    def find(doc, obj)
      eles = doc.search(selector)
      return find_values(obj, eles.first) unless options[:all]

      eles.map { |ele| find_values(obj, ele) }.compact
    end

    # Converts +value+ with the typecaster registered for options[:as].
    #
    # @raise [ArgumentError] when no typecaster is registered for the type
    def typecast(value)
      typecaster = SupportedTypes.types[options[:as]]
      unless typecaster
        fail ArgumentError, "No typecaster registered for #{options[:as].inspect} (field #{name.inspect})"
      end

      typecaster.apply(value)
    end

    # @return [Hash] serializable description; :as is exported by class name
    #   and a Proc :eval is dropped because it cannot be serialized
    def as_json
      h = { name: name, selector: selector }

      if options.any?
        h[:options] = options.dup
        h[:options][:as] = options[:as].name if options[:as]
        h[:options].delete(:eval) if options[:eval].is_a?(Proc)
      end

      h
    end

    private

    # Value for one node (typecast when :as is set); nil when the node is nil.
    def find_values(obj, ele)
      return nil unless ele

      value = process_ele(ele, obj)
      options[:as] ? typecast(value) : value
    end

    # Reads the raw text of a node, strips it and applies :eval when given.
    def process_ele(ele, obj)
      raw = if options[:attribute]
              # The attribute hash is keyed by String; convert the option so
              # that `attribute: :href` works as well as `attribute: 'href'`.
              ele.attributes[options[:attribute].to_s].to_s
            else
              ele.content
            end

      # Non-destructive strip: never mutate a string owned by the node.
      value = raw.to_s.strip
      return value unless options[:eval]

      if options[:eval].is_a?(Symbol)
        obj.send(options[:eval], value, ele)
      else
        options[:eval].call(value, ele)
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
module HtmlMapper
  # Converts scraper classes to a plain mapper hash (export) and rebuilds
  # scraper classes from such a hash (to_mapper).
  module MapperExporter
    module_function

    # @param [Class] klass class responding to domain_list, collections and
    #   default_collection
    # @return [Hash] { domains: ..., collections: [...] }; the default
    #   collection, when present, is exported last
    def export(klass)
      collections = klass.collections.map { |_, c| c.as_json }
      collections << klass.default_collection.as_json if klass.default_collection

      { domains: klass.domain_list, collections: collections }
    end

    # Creates an anonymous class that includes HtmlMapper.
    #
    # @param [Array, String, Regexp, nil] domain_list domains to register
    def new_scraper(domain_list = nil)
      Class.new do
        include HtmlMapper
        # Splat the list: `domains` takes one argument per domain, and an
        # Array passed as a single argument would not be registered at all.
        domains(*Array(domain_list)) if domain_list
      end
    end

    # Rebuilds a scraper class from a mapper hash with symbol keys.
    def to_mapper(mapper)
      klass = new_scraper(mapper[:domains])

      Array(mapper[:collections]).each do |c|
        klass.collections[c[:name].to_sym] = build_collection(c)
      end

      klass
    end

    # Builds a Collection with its fields and relations from its hash form.
    # The input hash is not modified.
    def build_collection(data)
      options = (data[:options] || {}).dup
      # A method name arrives as a String from JSON, but Collection only
      # dispatches Symbols to the scraper object.
      options[:reject_if] = options[:reject_if].to_sym if options[:reject_if].is_a?(String)

      collection = Collection.new(data[:name].to_sym, data[:selector], options)

      Array(data[:fields]).each { |f| add_field(collection, f) }
      Array(data[:relations]).each { |r| add_relation(collection, r) }

      collection
    end

    # Adds one field described by its hash form to +collection+.
    # The input hash is not modified.
    def add_field(collection, field)
      options = (field[:options] || {}).dup

      # :as is exported as a class name; map it back to the registered type.
      options[:as] = SupportedTypes.find_by_name(options[:as]) if options[:as].is_a?(String)
      # A JSON round trip turns a Symbol :eval into a String; Field only
      # dispatches Symbols to the scraper object.
      options[:eval] = options[:eval].to_sym if options[:eval].is_a?(String)

      # Symbol names keep values reachable through accessor-style lookups.
      name = field[:name].respond_to?(:to_sym) ? field[:name].to_sym : field[:name]

      collection.new_field(name, field[:selector], options)
    end

    # Adds one relation (including its nested mapper) to +collection+.
    def add_relation(collection, relation)
      relation_klass = to_mapper(relation[:mapper])
      collection.new_relation(relation[:name].to_sym, relation_klass, relation[:options] || {})
    end

    # Misspelled former name of add_relation, kept for existing callers.
    def add_realtion(collection, relation)
      add_relation(collection, relation)
    end
  end
end
@@ -0,0 +1,40 @@
1
module HtmlMapper
  # Hash-like access to the @values hash of the including object.
  # The including class is responsible for initialising @values.
  module ObjectHelper
    # @return [Object, nil] value stored under +f+
    def [](f)
      @values[f]
    end

    # Stores +v+ under +f+.
    def []=(f, v)
      @values[f] = v
    end

    # @return [String] values as pretty-printed JSON
    def to_s(*_args)
      JSON.pretty_generate(@values)
    end

    # @return [String] values as compact JSON; arguments are ignored
    def to_json(*_args)
      JSON.generate(@values)
    end

    # @return [Hash] the underlying values hash (not a copy)
    def to_hash
      @values
    end

    # Delegates to the hash's own as_json when one is defined; otherwise
    # returns the values hash itself, since plain Ruby hashes have no as_json.
    def as_json(*args)
      @values.respond_to?(:as_json) ? @values.as_json(*args) : @values
    end

    # Yields each field value, one at a time.
    def each(&blk)
      @values.each(&blk)
    end

    # @return [String] class, object id, the name held in @_name (when the
    #   including class sets it) and the values as JSON
    def inspect
      label = instance_variable_defined?(:@_name) ? @_name : nil
      "#<#{self.class}:0x#{object_id.to_s(16)}:#{label}> JSON: #{JSON.pretty_generate(@values)}"
    end

    # Any unknown message is answered with the value stored under its name
    # (nil when absent), so `obj.title` reads @values[:title].
    def method_missing(name, *_args, &_block)
      @values[name]
    end

    # Reports stored keys as supported messages so respond_to? matches what
    # method_missing can actually answer with a value.
    def respond_to_missing?(name, include_private = false)
      (@values && @values.key?(name)) || super
    end
  end
end
@@ -0,0 +1,34 @@
1
require 'uri'

module HtmlMapper
  # Registry of parser classes, keyed by host name or by Regexp.
  class Parsers
    class << self
      attr_accessor :parsers, :regx_parsers

      # Registers +klass+ for +domain+.
      #
      # A Regexp is stored as given. A String is reduced to its URI host, or
      # used verbatim when it has none (e.g. a bare "example.com"). Other
      # domain types are ignored. A class is stored at most once per key.
      #
      # @return [Array, nil] classes registered under the key; nil when ignored
      def add(klass, domain)
        case domain
        when Regexp then register(@regx_parsers, domain, klass)
        when String then register(@parsers, URI(domain).host || domain, klass)
        end
      end

      # Finds the parser classes for +url+: an exact host match wins,
      # otherwise the first registered Regexp matching the url is used.
      #
      # @return [Array, nil] parser classes, nil when nothing matches
      def get(url)
        by_host = @parsers[URI(url).host]
        return by_host if by_host

        match = @regx_parsers.find { |pattern, _| pattern =~ url }
        match && match.last
      end

      # Yields every (key, classes) pair: host entries first, then Regexp entries.
      def each(&blk)
        @parsers.each(&blk)
        @regx_parsers.each(&blk)
      end

      private

      # Appends +klass+ under +key+ unless it is already there, so a repeated
      # registration does not make the class parse the same page twice.
      def register(registry, key, klass)
        list = (registry[key] ||= [])
        list << klass unless list.include?(klass)
        list
      end
    end

    self.parsers = {}
    self.regx_parsers = {}
  end
end
@@ -0,0 +1,4 @@
1
module HtmlMapper
  # Empty placeholder class; it defines no behaviour of its own.
  class Phoenix; end
end
@@ -0,0 +1,43 @@
1
module HtmlMapper
  # Connects a parent result to an object parsed by another mapper class.
  class Relation
    attr_reader :name, :klass, :options

    # @param [#to_sym] name key under which the parsed object is stored
    # @param [Class, String] klass mapper class, or its constant name
    #   (e.g. "Foo::Bar"), which is resolved immediately
    # @param [Hash] options stored as given
    def initialize(name, klass, options = {})
      @name = name.to_sym
      @klass = klass.is_a?(String) ? string_to_constant(klass) : klass
      @options = options
    end

    # Parses +doc+ with the mapper class, stores the outcome in +parent+
    # under this relation's name and assigns +parent+ to it.
    #
    # @return [Object] the parsed object
    def parse(doc, parent)
      child = klass.parse(doc)
      parent[name] = child
      child.parent = parent
      child
    end

    # @return [Hash] serializable description including the nested mapper
    def as_json
      { name: name, klass: klass.name, options: options, mapper: klass.as_json }
    end

    private

    # Walks the "A::B::C" path from Object, one segment at a time; segments
    # that are not defined are handed to const_missing of the current scope.
    def string_to_constant(type)
      type.split('::').inject(Object) do |scope, segment|
        if scope.const_defined?(segment)
          scope.const_get(segment)
        else
          scope.const_missing(segment)
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module HtmlMapper
  # Holds the values extracted for one node, plus a reference to its parent.
  class Result
    include ObjectHelper

    attr_accessor :parent
    attr_reader :values, :_name

    # @param [Object] name stored as given and exposed through +_name+
    def initialize(name)
      @values = {}
      @_name = name
    end
  end
end
@@ -0,0 +1,102 @@
1
module HtmlMapper
  # Marker class used as the type key for boolean typecasting.
  class Boolean; end

  # Registry of typecasters, keyed by the target type.
  module SupportedTypes
    module_function

    # @return [Hash] registered typecasters keyed by type
    def types
      @types ||= {}
    end

    # Registers a typecaster for +type+: either the given object (which must
    # respond to +apply+) or a CastWhenType wrapping the block.
    def register_type(type, typecaster_obj = nil, &block)
      types[type] = typecaster_obj || CastWhenType.new(type, &block)
    end

    # @param [String] type class name, e.g. "Integer"
    # @return [Class, nil] the registered type with that name
    def find_by_name(type)
      types.keys.find { |klass| klass.name == type }
    end

    # Typecaster that runs a block; without a block the value passes through.
    class CastWhenType
      attr_reader :type

      def initialize(type, &block)
        @type = type
        @apply_block = block || no_operation
      end

      # @return [Proc] identity conversion
      def no_operation
        ->(value) { value }
      end

      # @return [Boolean] whether this caster targets +convert_to_type+
      def apply?(_value, convert_to_type)
        convert_to_type == type
      end

      def apply(value)
        @apply_block.call(value)
      end
    end

    # Typecaster that returns its value unchanged.
    class NilOrAlreadyConverted
      def type
        NilClass
      end

      # @return [Boolean] true for nil or a value already of the target type
      def apply?(value, convert_to_type)
        value.is_a?(convert_to_type) || value.nil?
      end

      def apply(value)
        value
      end
    end

    register_type NilOrAlreadyConverted, NilOrAlreadyConverted.new

    register_type String do |value|
      value.to_s
    end

    # Falls back to treating the value as epoch seconds when parsing fails.
    register_type Time do |value|
      begin
        Time.parse(value.to_s)
      rescue StandardError
        Time.at(value.to_i)
      end
    end

    register_type Date do |value|
      Date.parse(value.to_s)
    end

    register_type DateTime do |value|
      DateTime.parse(value.to_s)
    end

    BOOL_TYPES = %w(true t 1).freeze

    # Only the strings in BOOL_TYPES (case-insensitive) count as true.
    register_type Boolean do |value|
      BOOL_TYPES.include?(value.to_s.downcase)
    end

    # No longer used by the casters below; kept so existing references resolve.
    DIGIT_REGX = /^\d+/

    # A numeric prefix: optional whitespace and sign, then a digit (for
    # floats also ".<digit>"). Anchored with \A so that only the start of the
    # value counts, not the start of a later line.
    INTEGER_REGX = /\A\s*[+-]?\d/
    FLOAT_REGX = /\A\s*[+-]?(?:\d|\.\d)/

    # to_i yields 0 for non-numeric text as well, so a zero is only accepted
    # when the text really starts with a number ("-0" and "+0" included).
    register_type Integer do |value|
      value_to_i = value.to_i

      if value_to_i == 0 && value.to_s !~ INTEGER_REGX
        nil
      else
        value_to_i
      end
    end

    # Same rule as for Integer, applied to to_f.
    register_type Float do |value|
      value_to_f = value.to_f

      if value_to_f == 0.0 && value.to_s !~ FLOAT_REGX
        nil
      else
        value_to_f
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module HtmlMapper
  # Gem version; frozen so it cannot be modified in place.
  VERSION = '0.1.0'.freeze
end
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_mapper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jiren
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.10.4
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 1.10.4
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 1.5.5
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 1.5.5
69
+ - !ruby/object:Gem::Dependency
70
+ name: rest-client
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Parse html and map to ruby object or hash
84
+ email:
85
+ - jirenpatel@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".ruby-version"
93
+ - ".travis.yml"
94
+ - ".yardopts"
95
+ - CODE_OF_CONDUCT.md
96
+ - Gemfile
97
+ - LICENSE.txt
98
+ - README.md
99
+ - Rakefile
100
+ - bin/console
101
+ - bin/setup
102
+ - html_mapper.gemspec
103
+ - lib/generators/html_mapper/scraper_generator.rb
104
+ - lib/generators/html_mapper/templates/scraper.rb.erb
105
+ - lib/generators/html_mapper/templates/scraper_spec.rb.erb
106
+ - lib/generators/html_mapper/templates/scraper_test.rb.erb
107
+ - lib/html_mapper.rb
108
+ - lib/html_mapper/collection.rb
109
+ - lib/html_mapper/data_exporter.rb
110
+ - lib/html_mapper/field.rb
111
+ - lib/html_mapper/mapper_exporter.rb
112
+ - lib/html_mapper/object_helper.rb
113
+ - lib/html_mapper/parsers.rb
114
+ - lib/html_mapper/phoenix.rb
115
+ - lib/html_mapper/relation.rb
116
+ - lib/html_mapper/result.rb
117
+ - lib/html_mapper/supported_types.rb
118
+ - lib/html_mapper/version.rb
119
+ homepage: https://github.com/jiren/html_mapper
120
+ licenses:
121
+ - MIT
122
+ metadata: {}
123
+ post_install_message:
124
+ rdoc_options: []
125
+ require_paths:
126
+ - lib
127
+ required_ruby_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubyforge_project:
139
+ rubygems_version: 2.4.5
140
+ signing_key:
141
+ specification_version: 4
142
+ summary: HTML to ruby object or hash
143
+ test_files: []
144
+ has_rdoc: