apify_core 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 39b891973e8c30065d8136d0b61f4fea6c4c8328
4
+ data.tar.gz: e2ce4293075a8bb218660a0e0546e86e51559b90
5
+ SHA512:
6
+ metadata.gz: fdd15b69af5f5068a7f2fadc8afd2f746c074c757fb979569a65eeff525cca225472f21814b1869a20c147f5df3935ad50d40ab61f6a14d82009f50009077c9b
7
+ data.tar.gz: 57899e4ff289ead9ced7b376b345d4fd9f0b4b1f9b0a53221237724e2510c96bf225e6e44e1694317396bcee20caec8773c5e0fe7b044c02ddfb496e02635fb0
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in apify.gemspec
4
+ gemspec
5
+
6
+ gem 'activesupport', require: 'active_support/all'
7
+ gem 'watir-webdriver', '~> 0.6.11'
8
+ gem 'rest_client', '~> 1.8.2'
9
+ gem 'headless', '~> 1.0.2'
10
+ gem 'parallel', '~> 1.3.3'
11
+ gem 'nokogiri', '~> 1.6.5'
12
+
13
+ group :development do
14
+ gem 'awesome_print', '~> 1.2.0'
15
+ gem 'pry'
16
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,79 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ apify_core (0.0.1)
5
+ activesupport
6
+ headless
7
+ json
8
+ nokogiri (~> 1.6.5)
9
+ parallel
10
+ rest_client
11
+ watir-webdriver
12
+
13
+ GEM
14
+ remote: https://rubygems.org/
15
+ specs:
16
+ activesupport (4.2.0)
17
+ i18n (~> 0.7)
18
+ json (~> 1.7, >= 1.7.7)
19
+ minitest (~> 5.1)
20
+ thread_safe (~> 0.3, >= 0.3.4)
21
+ tzinfo (~> 1.1)
22
+ awesome_print (1.2.0)
23
+ childprocess (0.5.5)
24
+ ffi (~> 1.0, >= 1.0.11)
25
+ diff-lcs (1.2.5)
26
+ ffi (1.9.6)
27
+ headless (1.0.2)
28
+ i18n (0.7.0)
29
+ json (1.8.1)
30
+ mini_portile (0.6.2)
31
+ minitest (5.5.0)
32
+ multi_json (1.10.1)
33
+ netrc (0.7.9)
34
+ nokogiri (1.6.5)
35
+ mini_portile (~> 0.6.0)
36
+ parallel (1.3.3)
37
+ rake (10.4.2)
38
+ rest_client (1.8.2)
39
+ netrc (~> 0.7.7)
40
+ rspec (3.0.0)
41
+ rspec-core (~> 3.0.0)
42
+ rspec-expectations (~> 3.0.0)
43
+ rspec-mocks (~> 3.0.0)
44
+ rspec-core (3.0.4)
45
+ rspec-support (~> 3.0.0)
46
+ rspec-expectations (3.0.4)
47
+ diff-lcs (>= 1.2.0, < 2.0)
48
+ rspec-support (~> 3.0.0)
49
+ rspec-mocks (3.0.4)
50
+ rspec-support (~> 3.0.0)
51
+ rspec-support (3.0.4)
52
+ rubyzip (1.1.6)
53
+ selenium-webdriver (2.44.0)
54
+ childprocess (~> 0.5)
55
+ multi_json (~> 1.0)
56
+ rubyzip (~> 1.0)
57
+ websocket (~> 1.0)
58
+ thread_safe (0.3.4)
59
+ tzinfo (1.2.2)
60
+ thread_safe (~> 0.1)
61
+ watir-webdriver (0.6.11)
62
+ selenium-webdriver (>= 2.18.0)
63
+ websocket (1.2.1)
64
+
65
+ PLATFORMS
66
+ ruby
67
+
68
+ DEPENDENCIES
69
+ activesupport
70
+ apify_core!
71
+ awesome_print (~> 1.2.0)
72
+ bundler (~> 1.7)
73
+ headless (~> 1.0.2)
74
+ nokogiri (~> 1.6.5)
75
+ parallel (~> 1.3.3)
76
+ rake (~> 10.0)
77
+ rest_client (~> 1.8.2)
78
+ rspec (~> 3.0.0)
79
+ watir-webdriver (~> 0.6.11)
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 victorvsk
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # Apify Core
2
+
3
+ Apify Core is a part of the Apify Project. Parse HTML/XML into JSON with an easy API and useful filters.
4
+ The Apify Project allows even more: parsing an entire website with ease.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'apify_core'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install apify_core
21
+
22
+ ## Usage
23
+
24
+ ```
25
+ html = RestClient.get('http://github.com')
26
+ pattern = { title: '<% title %>' }
27
+ title = Apify::Core.new(html, pattern).perform # GitHub · Build software better, together.
28
+ ```
29
+
30
+ ```
31
+ request = { github: { url: ['http://github.com'], js: false, host: 'http://github.com', pattern: { title: '<% title %>' } } }
32
+ response = Apify.crawl!(request) # { "github": { "title": "GitHub · Build software better, together." } }
33
+ ```
34
+
35
+ See more in documentation (TODO). Also some syntax examples can be found in spec/examples.
36
+
37
+ ## Contributing
38
+
39
+ 1. Fork it ( https://github.com/victorvsk/apify-core/fork )
40
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
41
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
42
+ 4. Push to the branch (`git push origin my-new-feature`)
43
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
# Point Bundler at the Gemfile that sits next to this Rakefile, unless the
# caller already chose one. File.expand_path resolves its first argument
# relative to the second, and __FILE__ is the Rakefile itself (not its
# directory), so the path has to step up one level with '..'; './Gemfile'
# would resolve to "<root>/Rakefile/Gemfile".
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../Gemfile', __FILE__)
require 'rubygems'
# Loads Bundler's gem packaging rake tasks.
require "bundler/gem_tasks"
@@ -0,0 +1,34 @@
1
# coding: utf-8
# Gem specification for apify_core.
#
# Puts ./lib on the load path first so the version constant can be required
# without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'apify_core/version'

Gem::Specification.new do |spec|
  spec.name          = "apify_core"
  spec.version       = Apify::Core::VERSION
  spec.authors       = ["victorvsk"]
  spec.email         = ["victor@vyskrebentsev.ru"]
  spec.summary       = %q{Core part of Apify project. An easy way to parse HTML\XML content and crawl websites in a normalized and centralized way.}
  spec.description   = %q{Simple API to transform from simple HTML to JSON to entire website to JSON.}
  # Previously an empty string; this is the repository the README links to.
  spec.homepage      = "https://github.com/victorvsk/apify-core"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # selected from that list by directory prefix.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0.0"

  # Runtime dependencies.
  spec.add_dependency 'watir-webdriver'
  spec.add_dependency 'rest_client'
  spec.add_dependency 'headless'
  spec.add_dependency 'parallel'
  spec.add_dependency 'nokogiri', '~> 1.6.5'
  spec.add_dependency 'json'
  spec.add_dependency 'activesupport'
end
data/bin/bundler ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'bundler' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('bundler', 'bundler')
data/bin/coderay ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'coderay' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('coderay', 'coderay')
data/bin/htmldiff ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'htmldiff' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('diff-lcs', 'htmldiff')
data/bin/ldiff ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'ldiff' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('diff-lcs', 'ldiff')
data/bin/nokogiri ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'nokogiri' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('nokogiri', 'nokogiri')
data/bin/pry ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'pry' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('pry', 'pry')
data/bin/rackup ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rackup' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('rack', 'rackup')
data/bin/rake ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rake' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('rake', 'rake')
data/bin/rspec ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rspec' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('rspec-core', 'rspec')
data/bin/server ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'server' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('apify', 'server')
data/bin/tilt ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'tilt' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
12
+
13
+ require 'rubygems'
14
+ require 'bundler/setup'
15
+
16
+ load Gem.bin_path('tilt', 'tilt')
@@ -0,0 +1,190 @@
1
module Apify
  module Core
    # Prepares, downloads and parses the pages described by a crawl request.
    #
    # +pages+ is a Hash keyed by source name. From each value this class reads
    # the keys :url, :host, :js, :pattern, :paginate and :from.
    #
    # Typical flow: Fetcher.new(pages).prepare, then #perform (or #to_json).
    class Fetcher
      # Matches one quoted argument in parentheses, e.g. ("a.link").
      PARENTHESIS_ARGS = /\([\"\'](.*?)[\"\']\)/

      # Filters applied to a :from selection when the expression names none.
      DEFAULT_FROM_FILTERS = ['mapattr_href', 'map_urlencode'].freeze

      # Headers sent with every plain HTTP download.
      REQUEST_HEADERS = {
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language' => 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
        'Connection' => 'keep-alive',
        'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36'
      }.freeze

      attr_accessor :sources
      attr_accessor :result

      # @param pages [Hash] source name => source description
      # @param processes [Integer] worker processes used for non-JS downloads
      # @param delay [Numeric] seconds to sleep before each download
      def initialize(pages, processes = 2, delay = 0)
        @pages = pages
        @processes = processes
        @delay = delay
      end

      # Normalizes every source and stores it in an instance variable named
      # after the source, with a reader of the same name defined on the class.
      #
      # Sources with a :from expression are stored as given (their URLs are
      # resolved later, in #perform). All other sources get their URLs made
      # absolute and, when :paginate is present, expanded into one URL per page.
      #
      # @return [Array] the source names
      def prepare
        @pages.each do |key, value|
          raw_url = value[:url]
          # A single URL is wrapped so the map-style filter below can handle it;
          # previously a scalar :url raised NoMethodError here.
          raw_url = [raw_url] unless raw_url.nil? || raw_url.respond_to?(:each)
          value[:url] = Filter.apply(raw_url, ['map_urlencode'])
          self.class.send(:attr_reader, key.to_sym)

          if value[:from]
            instance_variable_set("@#{key}".to_sym, value)
            next
          end

          url = self.class.base_url_for(value[:url], value[:host])
          pages =
            if value[:paginate]
              self.class.paginate(url: url, to_replace: value[:paginate][0], pagination: value[:paginate][1])
            else
              [url.respond_to?(:each) ? url : url.to_s]
            end

          prepared = {
            pages: pages,
            js: value[:js] || false,
            pattern: value[:pattern],
            host: value[:host]
          }
          instance_variable_set("@#{key}".to_sym, prepared)
        end
        self.sources = @pages.keys
      end

      # Downloads every source's pages, then parses the sources that have a
      # :pattern.
      #
      # After this call each source's instance variable holds its parsed
      # elements (or the raw downloads when it has no pattern), so the
      # per-source hashes built by #prepare are gone.
      #
      # NOTE(review): a failed download yields nil in the page list, and that
      # nil is handed to Parser as-is; Parser is not visible here, so confirm
      # it tolerates nil input.
      #
      # @return [Hash] source name => parsed elements, for sources with a pattern
      def perform
        prepare if sources.nil?

        sources.each do |source|
          resolve_from(source) if @pages[source].key?(:from)

          src = send(source)
          mode = src[:js] ? :js : :normal
          # JS downloads each start a headless browser, so they run one at a time.
          workers = (mode == :js ? 1 : @processes)
          delay = @delay

          downloaded = ::Parallel.map(src[:pages], in_processes: workers) do |url_or_array|
            if url_or_array.respond_to?(:each)
              fetched = []
              url_or_array.each { |url| fetched << self.class.download(url, mode, delay) }
              fetched
            else
              self.class.download(url_or_array, mode, delay)
            end
          end
          src[:pages] = downloaded.flatten
        end

        collected = {}
        sources.each do |source|
          src = send(source)
          pattern = src[:pattern]
          parsed =
            if pattern
              src[:pages].map { |html| Parser.new(html, pattern).perform }
            else
              src[:pages]
            end

          collected[source] = parsed if pattern
          instance_variable_set("@#{source}".to_sym, parsed)
        end

        @result = collected
        @json = collected
      end

      # Serializes the result, running #perform first if it has not run yet.
      # Accepts and forwards the optional arguments JSON generators pass.
      #
      # @return [String]
      def to_json(*args)
        (@json || perform).to_json(*args)
      end

      private

      # Resolves a source declared with a :from expression such as
      #   select("a.item") from("listing") filter("mapattr_href | map_urlencode")
      # by selecting from the pages currently held by the referenced source,
      # filtering the selection and storing the resulting absolute URLs as this
      # source's :pages.
      #
      # @raise [ArgumentError] when select/from is missing or names an
      #   undeclared source
      def resolve_from(source)
        expression = @pages[source][:from]
        selector = expression[/select#{PARENTHESIS_ARGS}/, 1]
        origin = expression[/from#{PARENTHESIS_ARGS}/, 1]
        if selector.nil? || origin.nil?
          raise ArgumentError, "Malformed :from expression: #{expression.inspect}"
        end
        # The name comes from the request, so only declared sources may be
        # dispatched to.
        unless sources.map(&:to_s).include?(origin)
          raise ArgumentError, "Unknown source in :from expression: #{origin.inspect}"
        end

        filter_list = expression[/filter#{PARENTHESIS_ARGS}/, 1]
        filters = filter_list ? filter_list.split('|').map(&:strip) : []
        filters = DEFAULT_FROM_FILTERS.dup if filters.empty?

        urls = Filter.apply(Parser.fetch(selector, send(origin)[:pages]), filters)
        @pages[source][:pages] = self.class.base_url_for(urls, @pages[source][:host])
      end

      class << self
        # Expands a URL (or list of URLs) into one URL per page.
        #
        #   Fetcher.paginate(url: 'http://site.com', to_replace: '(\/?)$', pagination: '?page=<% 1,5,1 %>')
        #
        # @param opts [Hash]
        #   :url        - URL or collection of URLs (required)
        #   :pagination - text substituted into each URL; must contain at most
        #                 one "<% first,last,step %>" placeholder
        #   :to_replace - regexp source naming the part of the URL to replace
        # @return [Array] paginated URLs, grouped page by page; the URL itself
        #   wrapped in an array when :pagination has no placeholder
        # @raise [ArgumentError] when :url is missing or several placeholders
        #   are given
        def paginate(opts = {})
          pagination = opts[:pagination] || '?page=<% 1,5,1 %>'
          to_replace = opts[:to_replace] || '(\/?)\Z'
          url_or_array_of_urls = opts[:url]
          raise ArgumentError, "URL parameter missing" if url_or_array_of_urls.nil?

          regexp = /<%\s?+(\d+,\d+,\d+)\s?+%>/
          pattern = pagination.scan(regexp)
          return [url_or_array_of_urls] if pattern.empty?
          raise ArgumentError, "Only one pagination pattern allowed." if pattern.count > 1

          first_page, last_page, step = pattern.first.first.split(',').map(&:to_i)
          target = Regexp.new(to_replace)
          urls = url_or_array_of_urls.respond_to?(:each) ? url_or_array_of_urls : [url_or_array_of_urls]

          result = []
          (first_page..last_page).step(step) do |page|
            to_append = pagination.gsub(regexp, page.to_s)
            urls.each { |url| result << url.to_s.chomp('/').gsub(target, to_append) }
          end
          result
        end

        # Downloads one page and prints "+" on success or "-" on a handled
        # HTTP/URI failure.
        #
        # @param url [String, URI]
        # @param method [Symbol] :normal for a plain HTTP GET, :js to render the
        #   page in a headless browser; any other value downloads nothing
        # @param delay [Numeric] seconds to sleep before downloading
        # @return [String, nil] the HTML, or nil on a handled failure
        def download(url, method = :normal, delay = 0)
          pause = delay.to_f
          sleep pause if pause > 0

          case method
          when :js then download_with_browser(url)
          when :normal then download_with_http(url)
          end
        end

        # Makes URLs absolute by prefixing host-less ones with +base_url+.
        #
        # @param url_or_array [String, URI, #each]
        # @param base_url [String, nil]
        # @return [Array<String>] for a collection; for a single URL, the URI
        #   object when it already has a host, otherwise the joined String
        # @raise [ArgumentError] when a URL has no host and no base is given
        def base_url_for(url_or_array, base_url)
          if url_or_array.respond_to?(:each)
            result = []
            url_or_array.each { |url| result << absolute_url(url, base_url).to_s }
            result
          else
            absolute_url(url_or_array, base_url)
          end
        end

        private

        # Renders +url+ in a headless browser and returns the resulting HTML.
        # The browser and the virtual display are released even when
        # navigation fails.
        def download_with_browser(url)
          headless = Headless.new
          headless.start
          browser = Watir::Browser.new
          browser.goto url
          html = browser.html
          print "+"
          html
        ensure
          browser.close if browser
          headless.destroy if headless
        end

        # Fetches +url+ over HTTP and re-encodes the body to UTF-8 when the
        # page declares another charset. Returns nil on a handled failure.
        def download_with_http(url)
          # A copy is passed because the frozen constant must not be handed to
          # code that may modify the headers hash.
          html = RestClient.get(url, REQUEST_HEADERS.dup)
          charset = detect_charset(html)
          if charset && charset != 'utf-8'
            html = html.force_encoding(charset).encode("utf-8", undef: :replace)
          end
          print "+"
          html
        rescue RestClient::RequestTimeout, RestClient::ResourceNotFound, RestClient::InternalServerError,
               RestClient::Forbidden, RestClient::BadGateway, RestClient::Exception, URI::InvalidURIError
          print "-"
          nil
        end

        # Reads the charset from the last content-type <meta> tag.
        # Best effort: any failure while inspecting the document means "unknown".
        #
        # @return [String, nil] lower-cased charset name, or nil
        def detect_charset(html)
          meta = Nokogiri::HTML(html).search('meta[@http-equiv="content-type"]').last
          declared = meta && meta['content'].to_s[/charset=(.+)/, 1]
          declared = declared.to_s.strip.downcase
          declared.empty? ? nil : declared
        rescue StandardError
          nil
        end

        # Returns the parsed URI when it has a host, otherwise +base_url+
        # joined with it.
        def absolute_url(url, base_url)
          uri = URI(url)
          return uri if uri.host
          raise ArgumentError, "No host provided." if base_url.nil?

          join_url(base_url, uri)
        end

        # Joins a base and a relative reference with exactly one slash between
        # them; query-only and fragment-only references are appended directly.
        def join_url(base_url, relative)
          base = base_url.to_s
          path = relative.to_s
          if base.end_with?('/') && path.start_with?('/')
            base.chomp('/') + path
          elsif base.end_with?('/') || path.empty? || path.start_with?('/', '?', '#')
            base + path
          else
            "#{base}/#{path}"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
module Apify
  module Core
    # Named transformations ("filters") applied in sequence to a parsed node,
    # a collection of nodes, or a string.
    #
    # Besides the filters defined below, two dynamic families are supported:
    #   attr_NAME    - reads ["NAME"] from the value
    #   mapattr_NAME - reads ["NAME"] from every element of the value
    module Filter
      # Names handled dynamically by method_missing.
      DYNAMIC_FILTER = /\A(?:map)?attr_/

      # Methods of this module that are plumbing rather than filters.
      INTERNAL_METHODS = [:apply, :dispatch, :filter?, :method_missing, :respond_to_missing?].freeze

      class << self
        # Applies +filters+ to +node_or_str+ from left to right.
        #
        # The filter list is only read, never modified, so callers can reuse it.
        # Processing stops as soon as a value becomes nil.
        #
        # @param node_or_str [Object, nil] value to transform
        # @param filters [Array<String, Symbol>] filter names, in order
        # @return [Object, nil] the transformed value
        # @raise [NoMethodError] when a name is not a known filter
        def apply(node_or_str, filters = [])
          Array(filters).reduce(node_or_str) do |value, name|
            break value if value.nil?

            dispatch(name, value)
          end
        end

        private

        # Runs a single filter. Filter names come from user-written patterns,
        # so only filters defined in this module (or the dynamic attr_/mapattr_
        # families) are dispatched; anything else, such as Kernel methods
        # reachable through send, is rejected.
        def dispatch(name, value)
          name = name.to_s
          raise NoMethodError, "undefined filter '#{name}' for #{self}" unless filter?(name)

          send(name, value)
        end

        # True when +name+ is a filter defined on this module or a dynamic one.
        def filter?(name)
          return true if name =~ DYNAMIC_FILTER

          own = singleton_class.private_instance_methods(false) +
                singleton_class.public_instance_methods(false)
          (own - INTERNAL_METHODS).map(&:to_s).include?(name)
        end

        # First element of a collection.
        def first(node)
          node.first
        end

        # Text content of a node.
        def text(node)
          node.text
        end

        # String without leading/trailing whitespace (nil stays nil).
        def strip(str)
          str.strip if str
        end

        # Identity filter: returns the value unchanged.
        def list(node)
          node
        end

        # String form of the node.
        def html(node)
          node.to_s
        end

        # Inner HTML of a node, as a string.
        def inner_html(node)
          node.inner_html.to_s
        end

        # Stripped text of every element.
        def map_text(node)
          node.map(&:text).map(&:strip)
        end

        # String form of every element.
        def map_html(node)
          node.map(&:to_s)
        end

        # Inner HTML of every element, as strings.
        def map_inner_html(node)
          node.map(&:inner_html).map(&:to_s)
        end

        # URL-encodes every element (see #urlencode).
        def map_urlencode(node)
          node.map { |url| urlencode(url) }
        end

        # Returns a URI object when +url+ parses as is; otherwise returns the
        # percent-escaped String. URI.encode was removed in Ruby 3.0, so the
        # default parser's escape is called directly.
        def urlencode(url)
          URI(url)
        rescue URI::InvalidURIError
          URI::DEFAULT_PARSER.escape(url)
        end

        # Implements the attr_NAME and mapattr_NAME filter families. Only the
        # leading prefix is removed, so attribute names that themselves contain
        # "attr_" are preserved.
        def method_missing(method_sym, *arguments, &block)
          name = method_sym.to_s
          if name.start_with?('mapattr_')
            attribute = name.sub(/\Amapattr_/, '')
            arguments.first.map { |n| n[attribute] }
          elsif name.start_with?('attr_')
            attribute = name.sub(/\Aattr_/, '')
            arguments.first[attribute]
          else
            super
          end
        end

        # Keeps respond_to? consistent with method_missing.
        def respond_to_missing?(method_sym, include_private = false)
          method_sym.to_s =~ DYNAMIC_FILTER ? true : super
        end
      end
    end
  end
end