autopagerize 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in autopagerize.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 uu59
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,65 @@
1
+ # Autopagerize gem
2
+
3
+ The Autopagerize gem works like the following browser extensions:
4
+
5
+ - [AutoPager Chrome](https://chrome.google.com/webstore/detail/autopager-chrome/mmgagnmbebdebebbcleklifnobamjonh?hl=ja)
6
+ - [AutoPagerize](https://chrome.google.com/webstore/detail/autopagerize/igiofjhpmpihnifddepnpngfjhkfenbp)
7
+ - [AutoPatchWork](https://chrome.google.com/webstore/detail/autopatchwork/aeolcjbaammbkgaiagooljfdepnjmkfd?hl=ja)
8
+
9
+ ## Usage
10
+
11
+ You must download SITEINFO from <http://wedata.net/databases/AutoPagerize/items_all.json>
12
+
13
+ $ curl http://wedata.net/databases/AutoPagerize/items_all.json > siteinfo.json
14
+
15
+ require "rubygems"
16
+ require "multi_json"
17
+ require "autopagerize"
18
+
19
+ siteinfo = MultiJson.load(File.read("siteinfo.json"))
20
+ url = "http://www.google.com/search?q=test"
21
+ page = Autopagerize.new(url, siteinfo)
22
+ puts page.processed_html
23
+
24
+ ## Advanced Usage
25
+
26
+ require "rubygems"
27
+ require "multi_json"
28
+ require "autopagerize"
29
+
30
+ siteinfo = MultiJson.load(File.read("siteinfo.json"))
31
+ url = "http://apod.nasa.gov/apod/ap090903.html"
32
+ apod = Autopagerize.new(url, siteinfo, {
33
+ :maxpage => 30,
34
+ :headers => {
35
+ "User-Agent" => "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
36
+ },
37
+ })
38
+ apod.each do |page|
39
+ img = page.page_element.at_xpath('//a/img/..')
40
+ next unless img
41
+ puts URI.join(url, img.attributes["href"].to_s).to_s
42
+ end
43
+
44
+ ## Installation
45
+
46
+ Add this line to your application's Gemfile:
47
+
48
+ gem 'autopagerize'
49
+
50
+ And then execute:
51
+
52
+ $ bundle
53
+
54
+ Or install it yourself as:
55
+
56
+ $ gem install autopagerize
57
+
58
+
59
+ ## Contributing
60
+
61
+ 1. Fork it
62
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
63
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
64
+ 4. Push to the branch (`git push origin my-new-feature`)
65
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for autopagerize.
#
# Puts the gem's own lib/ directory on the load path so the version constant
# can be read without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'autopagerize/version'

Gem::Specification.new do |gem|
  gem.name          = "autopagerize"
  gem.version       = Autopagerize::VERSION
  gem.authors       = ["uu59"]
  gem.email         = ["k@uu59.org"]
  gem.description   = %q{Concat paginated web pages to single page}
  gem.summary       = %q{Concat paginated web pages to single page}
  gem.homepage      = ""
  # The package ships LICENSE.txt with the MIT License; declare it so the
  # published metadata carries the license as well.
  gem.license       = "MIT"

  # The packaged file list is whatever git tracks at build time.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies (required by lib/autopagerize.rb).
  gem.add_dependency "addressable"
  gem.add_dependency "httpclient"
  gem.add_dependency "nokogiri"

  # Development-only dependencies (specs and README examples).
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "multi_json"
end
@@ -0,0 +1,113 @@
1
+ #require "autopagerize/version"
2
+ require "addressable/uri"
3
+ require "nokogiri"
4
+ require "httpclient"
5
+
6
# Walks a paginated web site by following the "next page" link described by
# an AutoPagerize SITEINFO entry.
#
# An instance represents one fetched page. Pages can be visited one by one
# (the class is Enumerable) or merged into a single document with
# #processed_document / #processed_html.
class Autopagerize
  include Enumerable

  # `client` is intentionally not listed here: it is defined explicitly below
  # (it lazily builds the HTTP client), and listing it would only define a
  # method that is immediately overwritten.
  attr_reader :url, :siteinfo, :options

  # url      - URL of this page (String).
  # siteinfo - Array of SITEINFO entries; each entry is a Hash with a "data"
  #            Hash holding "url", "nextLink", "pageElement" and optionally
  #            "insertBefore".
  # options  - Hash overriding the defaults below.
  def initialize(url, siteinfo, options = {})
    @url = url
    @siteinfo = siteinfo

    @options = {
      :maxpage => 10,
      :headers => {},

      # for internal/test use
      :current_page => 1,
      :httpclient => nil,
      :site => nil,
    }.merge(options)

    @site = @options[:site]
  end

  # Returns the absolute URL of the following page, or nil when no SITEINFO
  # entry matches, the next-link node is absent, or it carries no href.
  def nextlink
    return nil unless site
    node = document.at_xpath(site["data"]["nextLink"])
    return nil unless node
    href = node.attributes["href"].to_s
    # Joining an empty reference would resolve to the current page again.
    return nil if href.empty?
    Addressable::URI.join(url, href).to_s
  end

  # Returns the Autopagerize instance for the following page (memoized), or
  # nil when :maxpage has been reached or there is no next link.
  def next
    return nil if options[:maxpage] <= options[:current_page]
    @next ||= begin
      link = nextlink
      link && Autopagerize.new(link, siteinfo, options.merge(:current_page => options[:current_page] + 1, :site => site))
    end
  end

  # Yields this page and then every following page in order.
  # Returns an Enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?
    current = self
    yield current
    while current = current.next
      yield current
    end
  end

  # Returns a copy of this page's document with the page elements of all
  # following pages inserted at the SITEINFO insertion point (memoized).
  # When no SITEINFO entry matches, or no insertion point can be found, the
  # copy is returned unchanged.
  def processed_document
    @processed_document ||= begin
      result = document.dup
      own_page = page
      @processed_page_elements = own_page ? [own_page] : []

      point, temporary = insertion_point(result)
      if point
        current = self
        while current = current.next
          # Look the element up once: inserting it presumably detaches it
          # from its source document, so a second lookup could return a
          # different node (or nil).
          element = current.page
          if element
            point.before(element)
            @processed_page_elements << element
          end
        end
        # Only the temporary marker is removed; a real insertBefore node
        # belongs to the page and must stay in the result.
        point.remove if temporary
      end
      result
    end
  end

  # Returns the page elements collected by #processed_document
  # (this page's element first).
  def processed_page_elements
    processed_document
    @processed_page_elements
  end

  # Returns the merged document serialized via #to_xml.
  def processed_html
    processed_document.to_xml
  end

  # Returns the first SITEINFO entry whose URL pattern matches #url and whose
  # nextLink XPath finds a node in this page (memoized), or nil.
  def site
    @site ||= siteinfo.find { |entry| site_matches?(entry) }
  end

  # Returns the HTTP client, creating an HTTPClient on first use. The client
  # is stored in the options so that following pages reuse it.
  def client
    options[:httpclient] ||= HTTPClient.new
  end

  # Returns the body fetched from #url (memoized).
  def html
    @html ||= client.get_content(url, nil, options[:headers])
  end

  # Returns the parsed HTML document of this page (memoized).
  def document
    @document ||= Nokogiri::HTML.parse(html)
  end

  # Returns the last node matched by the SITEINFO pageElement XPath, or nil
  # when no SITEINFO entry matches or the entry has no pageElement.
  def page
    return nil unless site
    path = site["data"]["pageElement"]
    return nil if path.nil? || path.empty?
    document.xpath(path).last
  end
  alias :page_element :page

  private

  # Locates the node before which following pages are inserted in `result`.
  #
  # Insert rule:
  #   https://autopagerize.jottit.com/details_of_siteinfo_(ja)
  #
  # Returns [node, temporary]. `temporary` is true when the node is a marker
  # created here (placed right after the last page element) that the caller
  # must remove afterwards. Returns nil when there is nowhere to insert.
  def insertion_point(result)
    return nil unless site
    before = site["data"]["insertBefore"]
    existing = (before.nil? || before.empty?) ? nil : result.at_xpath(before)
    return [existing, false] if existing

    path = site["data"]["pageElement"]
    return nil if path.nil? || path.empty?
    last_page = result.xpath(path).last
    return nil unless last_page

    marker = Nokogiri::XML::Node.new("dummy_for_autopagerize", result.document)
    last_page.after(marker)
    [marker, true]
  end

  # True when the SITEINFO entry applies to this page. Entries without data,
  # URL pattern or nextLink, and entries whose URL pattern is not a valid
  # regular expression, are treated as non-matching so that a single broken
  # entry does not abort the lookup.
  def site_matches?(entry)
    data = entry["data"]
    return false unless data && data["url"] && data["nextLink"]
    return false unless Regexp.new(normalize_regex(data["url"])).match(url)
    !document.at_xpath(data["nextLink"]).nil?
  rescue RegexpError
    false
  end

  def normalize_regex(re)
    # to be quiet
    # warning: nested repeat operator + and ? was replaced with '*': /^http:\/\/1stpower\.web\.fc2\.com\/comic(?:\d+)?/
    re.gsub("+)?", "*)")
  end
end
@@ -0,0 +1,3 @@
1
class Autopagerize
  # Release version of the gem; read by autopagerize.gemspec.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,56 @@
1
+ # -- coding: utf-8
2
+
3
+ require "spec_helper"
4
+
5
+ describe Autopagerize do
6
+ let(:siteinfo) do
7
+ json = MultiJson.dump [
8
+ {
9
+ :data => {
10
+ :url => "^http://foo/",
11
+ :nextLink => "nomatch",
12
+ },
13
+ },
14
+ {
15
+ :data => {
16
+ :url => "^http://bar/",
17
+ :pageElement => "//page",
18
+ :nextLink => "//a",
19
+ }
20
+ },
21
+ {
22
+ :data => {
23
+ :url => "^http://baz/",
24
+ :pageElement => "//foobar",
25
+ :nextLink => "//a",
26
+ }
27
+ },
28
+ ]
29
+ MultiJson.load(json)
30
+ end
31
+
32
+ it "should autopagerize" do
33
+ first = <<-HTML
34
+ <!doctype html>
35
+ <page>first page</page>
36
+ <a href="next.html">next link</a>
37
+ HTML
38
+
39
+ second = <<-HTML
40
+ <!doctype html>
41
+ <page>next.html</page>
42
+ <a href="morenext.html">go to final page</a>
43
+ HTML
44
+
45
+ third = <<-HTML
46
+ <!doctype html>
47
+ <page>final page</page>
48
+ HTML
49
+
50
+ client = HTTPClient.new
51
+ client.test_loopback_response << first << second << third
52
+ a = Autopagerize.new("http://bar/", siteinfo, :httpclient => client)
53
+ a.nextlink.should == "http://bar/next.html"
54
+ a.to_enum.to_a.length.should == 3
55
+ end
56
+ end
@@ -0,0 +1,14 @@
1
+ # -- coding: utf-8
2
+
3
+ require "rubygems"
4
+ require "bundler/setup"
5
+ Bundler.require :default, :test, :development
6
+ require "rspec-expectations"
7
+ require "rspec/matchers/built_in/be"
8
+
9
+ Dir["./spec/support/**/*.rb"].each{|file| require file }
10
+
11
+ require File.expand_path("../../lib/autopagerize.rb", __FILE__)
12
+
13
+ RSpec.configure do |config|
14
+ end
metadata ADDED
@@ -0,0 +1,138 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: autopagerize
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - uu59
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-11 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: addressable
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: httpclient
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rspec
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: multi_json
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: Concat paginated web pages to single page
95
+ email:
96
+ - k@uu59.org
97
+ executables: []
98
+ extensions: []
99
+ extra_rdoc_files: []
100
+ files:
101
+ - .gitignore
102
+ - Gemfile
103
+ - LICENSE.txt
104
+ - README.md
105
+ - Rakefile
106
+ - autopagerize.gemspec
107
+ - lib/autopagerize.rb
108
+ - lib/autopagerize/version.rb
109
+ - spec/autopagerize_spec.rb
110
+ - spec/spec_helper.rb
111
+ homepage: ''
112
+ licenses: []
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ none: false
119
+ requirements:
120
+ - - ! '>='
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ none: false
125
+ requirements:
126
+ - - ! '>='
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 1.8.23
132
+ signing_key:
133
+ specification_version: 3
134
+ summary: Concat paginated web pages to single page
135
+ test_files:
136
+ - spec/autopagerize_spec.rb
137
+ - spec/spec_helper.rb
138
+ has_rdoc: