autopagerize 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in autopagerize.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 uu59
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,65 @@
1
+ # Autopagerize gem
2
+
3
+ The Autopagerize gem works like the following browser extensions:
4
+
5
+ - [AutoPager Chrome](https://chrome.google.com/webstore/detail/autopager-chrome/mmgagnmbebdebebbcleklifnobamjonh?hl=ja)
6
+ - [AutoPagerize](https://chrome.google.com/webstore/detail/autopagerize/igiofjhpmpihnifddepnpngfjhkfenbp)
7
+ - [AutoPatchWork](https://chrome.google.com/webstore/detail/autopatchwork/aeolcjbaammbkgaiagooljfdepnjmkfd?hl=ja)
8
+
9
+ ## Usage
10
+
11
+ You must first download the SITEINFO data from <http://wedata.net/databases/AutoPagerize/items_all.json>:
12
+
13
+ $ curl http://wedata.net/databases/AutoPagerize/items_all.json > siteinfo.json
14
+
15
+ require "rubygems"
16
+ require "multi_json"
17
+ require "autopagerize"
18
+
19
+ siteinfo = MultiJson.load(File.read("siteinfo.json"))
20
+ url = "http://www.google.com/search?q=test"
21
+ page = Autopagerize.new(url, siteinfo)
22
+ puts page.processed_html
23
+
24
+ ## Advanced Usage
25
+
26
+ require "rubygems"
27
+ require "multi_json"
28
+ require "autopagerize"
29
+
30
+ siteinfo = MultiJson.load(File.read("siteinfo.json"))
31
+ url = "http://apod.nasa.gov/apod/ap090903.html"
32
+ apod = Autopagerize.new(url, siteinfo, {
33
+ :maxpage => 30,
34
+ :headers => {
35
+ "User-Agent" => "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
36
+ },
37
+ })
38
+ apod.each do |page|
39
+ img = page.page_element.at_xpath('//a/img/..')
40
+ next unless img
41
+ puts URI.join(url, img.attributes["href"].to_s).to_s
42
+ end
43
+
44
+ ## Installation
45
+
46
+ Add this line to your application's Gemfile:
47
+
48
+ gem 'autopagerize'
49
+
50
+ And then execute:
51
+
52
+ $ bundle
53
+
54
+ Or install it yourself as:
55
+
56
+ $ gem install autopagerize
57
+
58
+
59
+ ## Contributing
60
+
61
+ 1. Fork it
62
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
63
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
64
+ 4. Push to the branch (`git push origin my-new-feature`)
65
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for autopagerize.
#
# The version number is read from lib/autopagerize/version.rb, so lib/ is put
# on the load path before that file is required.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'autopagerize/version'

Gem::Specification.new do |gem|
  gem.name          = "autopagerize"
  gem.version       = Autopagerize::VERSION
  gem.authors       = ["uu59"]
  gem.email         = ["k@uu59.org"]
  gem.description   = %q{Concat paginated web pages to single page}
  gem.summary       = %q{Concat paginated web pages to single page}
  gem.homepage      = ""
  # The project ships an MIT LICENSE.txt; declare it so the packaged gem does
  # not end up with an empty license list.
  gem.license       = "MIT"

  # NUL-separated listing: independent of the global record separator ($/) and
  # safe for file names that contain whitespace or newlines.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies (required by lib/autopagerize.rb).
  gem.add_dependency "addressable"
  gem.add_dependency "httpclient"
  gem.add_dependency "nokogiri"

  # Only needed to run the specs.
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "multi_json"
end
@@ -0,0 +1,113 @@
1
+ #require "autopagerize/version"
2
+ require "addressable/uri"
3
+ require "nokogiri"
4
+ require "httpclient"
5
+
6
# Follows the "next page" link described by an AutoPagerize SITEINFO rule and
# exposes the resulting chain of pages, either one at a time (Enumerable) or
# merged into a single document.
#
# SITEINFO is expected to be an Array of Hashes with string keys, each holding
# a "data" Hash with "url" (regexp source), "nextLink" and "pageElement"
# (XPath) and optionally "insertBefore" (XPath).
class Autopagerize
  include Enumerable

  # :client is defined explicitly below (lazy HTTPClient), so it is not listed
  # here; listing it only produced a "method redefined" warning.
  attr_reader :url, :siteinfo, :options

  # @param url [String] URL of the page to start from
  # @param siteinfo [Array<Hash>] SITEINFO rules (see class comment)
  # @param options [Hash] :maxpage (default 10) limits how many pages are
  #   followed; :headers is passed to the HTTP client on every request.
  #   :current_page, :httpclient and :site are for internal/test use.
  def initialize(url, siteinfo, options = {})
    @url = url
    @siteinfo = siteinfo

    @options = {
      :maxpage => 10,
      :headers => {},

      # for internal/test use
      :current_page => 1,
      :httpclient => nil,
      :site => nil,
    }.merge(options)

    @site = @options[:site]
  end

  # @return [String, nil] absolute URL of the next page, or nil when no rule
  #   applies or the rule's nextLink XPath matches nothing on this page
  def nextlink
    return nil unless site
    node = document.at_xpath(site["data"]["nextLink"])
    return nil unless node
    Addressable::URI.join(url, node.attributes["href"].to_s).to_s
  end

  # @return [Autopagerize, nil] the following page (memoized), or nil when
  #   :maxpage has been reached or there is no next link
  def next
    return nil if options[:maxpage] <= options[:current_page]
    @next ||= begin
      link = nextlink # evaluate the XPath lookup only once
      link && Autopagerize.new(link, siteinfo, options.merge(:current_page => options[:current_page] + 1, :site => site))
    end
  end

  # Yields this page and then every following page.
  # Returns an Enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?
    current = self
    while current
      yield current
      current = current.next
    end
  end

  # Builds (once) a copy of this page's document with the page elements of all
  # following pages inserted.
  #
  # Insert rule:
  # https://autopagerize.jottit.com/details_of_siteinfo_(ja)
  #
  # When no SITEINFO rule applies, the copy is returned unmodified instead of
  # raising NoMethodError.
  def processed_document
    @processed_document ||= begin
      result = document.dup
      @processed_page_elements = []
      merge_following_pages(result) if site
      result
    end
  end

  # @return [Array] page elements (taken from each page's own document) that
  #   make up the merged document, in page order
  def processed_page_elements
    processed_document
    @processed_page_elements
  end

  # @return [String] serialized form of #processed_document
  def processed_html
    processed_document.to_xml
  end

  # @return [Hash, nil] first SITEINFO rule whose "url" pattern matches #url
  #   and whose "nextLink" XPath finds a node on this page (memoized)
  def site
    @site ||= siteinfo.find { |entry| applicable?(entry) }
  end

  # @return [HTTPClient] client used for fetching; created on first use and
  #   stored in the options so following pages share it
  def client
    options[:httpclient] ||= HTTPClient.new
  end

  # @return [String] raw body fetched from #url (memoized)
  def html
    @html ||= client.get_content(url, nil, options[:headers])
  end

  # @return [Nokogiri::HTML::Document] parsed #html (memoized)
  def document
    @document ||= Nokogiri::HTML.parse(html)
  end

  # @return [Object, nil] last node matching the rule's "pageElement" XPath,
  #   or nil when no rule applies or nothing matches
  def page
    return nil unless site
    document.xpath(site["data"]["pageElement"]).last
  end
  alias :page_element :page

  private

  # Inserts the page element of every following page into +result+ and
  # records the elements in @processed_page_elements.
  def merge_following_pages(result)
    rule = site["data"]
    first_page = page
    @processed_page_elements << first_page if first_page

    before = rule["insertBefore"]
    anchor = result.at_xpath(before) unless before.nil? || before.empty?

    # Without a usable insertBefore node, insert right after the last page
    # element by way of a temporary placeholder node.
    placeholder = nil
    unless anchor
      last_element = result.xpath(rule["pageElement"]).last
      return unless last_element # nowhere to insert
      placeholder = Nokogiri::XML::Node.new("dummy_for_autopagerize", result.document)
      last_element.after(placeholder)
      anchor = placeholder
    end

    current = self
    while (current = current.next)
      # Look the element up once, before inserting: re-querying afterwards
      # could return a different node (or nil) if the element had been moved.
      element = current.page
      next unless element
      # Insert a copy so the source page keeps its own element.
      anchor.before(element.dup)
      @processed_page_elements << element
    end

    # Only the temporary placeholder is removed; a real insertBefore node is
    # part of the page and must stay in the document.
    placeholder.remove if placeholder
  end

  # True when +entry+ is a usable rule for this page. Entries with a missing
  # or uncompilable "url" pattern are skipped rather than aborting the search.
  def applicable?(entry)
    data = entry["data"] || {}
    pattern = data["url"]
    next_link = data["nextLink"]
    return false unless pattern && next_link
    return false unless Regexp.new(normalize_regex(pattern)).match(url)
    !document.at_xpath(next_link).nil?
  rescue RegexpError
    false
  end

  # Rewrites "+)?" to "*)" in a pattern source to silence Ruby's
  # "nested repeat operator + and ? was replaced with '*'" warning, e.g. for
  # /^http:\/\/1stpower\.web\.fc2\.com\/comic(?:\d+)?/
  def normalize_regex(re)
    re.gsub("+)?", "*)")
  end
end
@@ -0,0 +1,3 @@
1
class Autopagerize
  # Release version of the gem; autopagerize.gemspec reads this constant.
  VERSION = "0.0.1"
end
@@ -0,0 +1,56 @@
1
+ # -- coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
describe Autopagerize do
  # Three SITEINFO rules, round-tripped through JSON before use (the library
  # reads rules with string keys such as "data" and "nextLink").
  #   1. matches http://foo/ only
  #   2. matches http://bar/ -- the rule the example below relies on
  #   3. matches http://baz/ only
  let(:siteinfo) do
    rules = [
      { :data => { :url => "^http://foo/", :nextLink => "nomatch" } },
      { :data => { :url => "^http://bar/", :pageElement => "//page", :nextLink => "//a" } },
      { :data => { :url => "^http://baz/", :pageElement => "//foobar", :nextLink => "//a" } },
    ]
    MultiJson.load(MultiJson.dump(rules))
  end

  it "should autopagerize" do
    # Canned responses: page one links to page two, page two links to page
    # three, and page three has no link, which ends the chain.
    first = <<-HTML
<!doctype html>
<page>first page</page>
<a href="next.html">next link</a>
    HTML

    second = <<-HTML
<!doctype html>
<page>next.html</page>
<a href="morenext.html">go to final page</a>
    HTML

    third = <<-HTML
<!doctype html>
<page>final page</page>
    HTML

    # Queue the bodies on the client's loopback list so no real HTTP request
    # is made; they are served in the order queued.
    client = HTTPClient.new
    [first, second, third].each { |body| client.test_loopback_response << body }

    pager = Autopagerize.new("http://bar/", siteinfo, :httpclient => client)
    pager.nextlink.should == "http://bar/next.html"
    pager.to_enum.to_a.length.should == 3
  end
end
@@ -0,0 +1,14 @@
1
+ # -- coding: utf-8
2
+
3
# Load the bundle (default, test and development groups) and the parts of
# rspec the specs rely on.
require "rubygems"
require "bundler/setup"
Bundler.require(:default, :test, :development)
require "rspec-expectations"
require "rspec/matchers/built_in/be"

# Pull in every shared helper under spec/support, if any exist.
Dir["./spec/support/**/*.rb"].each do |support_file|
  require support_file
end

# Load the library under test from this checkout.
require File.expand_path("../../lib/autopagerize.rb", __FILE__)

# No custom RSpec configuration yet.
RSpec.configure { |config| }
metadata ADDED
@@ -0,0 +1,138 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: autopagerize
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - uu59
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-11 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: addressable
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: httpclient
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rspec
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: multi_json
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: Concat paginated web pages to single page
95
+ email:
96
+ - k@uu59.org
97
+ executables: []
98
+ extensions: []
99
+ extra_rdoc_files: []
100
+ files:
101
+ - .gitignore
102
+ - Gemfile
103
+ - LICENSE.txt
104
+ - README.md
105
+ - Rakefile
106
+ - autopagerize.gemspec
107
+ - lib/autopagerize.rb
108
+ - lib/autopagerize/version.rb
109
+ - spec/autopagerize_spec.rb
110
+ - spec/spec_helper.rb
111
+ homepage: ''
112
+ licenses: []
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ none: false
119
+ requirements:
120
+ - - ! '>='
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ none: false
125
+ requirements:
126
+ - - ! '>='
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 1.8.23
132
+ signing_key:
133
+ specification_version: 3
134
+ summary: Concat paginated web pages to single page
135
+ test_files:
136
+ - spec/autopagerize_spec.rb
137
+ - spec/spec_helper.rb
138
+ has_rdoc: