amazon_dp 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +35 -0
- data/Rakefile +2 -0
- data/amazon_dp.gemspec +17 -0
- data/examples/fetch_and_parse.rb +23 -0
- data/examples/quick_fetch.rb +8 -0
- data/lib/amazon_dp/fetcher.rb +56 -0
- data/lib/amazon_dp/parser.rb +95 -0
- data/lib/amazon_dp/utils.rb +10 -0
- data/lib/amazon_dp/version.rb +3 -0
- data/lib/amazon_dp.rb +7 -0
- metadata +59 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 kimoto
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# AmazonDP
|
2
|
+
|
3
|
+
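AmazonDP is a parser for Amazon "Description of Product" (`/dp/`) pages. It extracts the Kindle price and page count, the star rating, the review count and the like count from a product page.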
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'amazon_dp'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install amazon_dp
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
#1
|
22
|
+
require 'amazon_dp'
|
23
|
+
page_info = AmazonDP.get("http://www.amazon.co.jp/dp/ASIN_CODE")
|
24
|
+
|
25
|
+
#2
|
26
|
+
require 'amazon_dp'
|
27
|
+
page_info = AmazonDP.get("ASIN_CODE")
|
28
|
+
|
29
|
+
#3
|
30
|
+
require 'amazon_dp'
|
31
|
+
f = AmazonDP::Fetcher.new
|
32
|
+
f.adult_auth
|
33
|
+
html = f.fetch("URL_OR_ASIN_CODE")
|
34
|
+
page_info = AmazonDP::Parser.new.parse_html(html)
|
35
|
+
|
data/Rakefile
ADDED
data/amazon_dp.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for amazon_dp.
require File.expand_path('../lib/amazon_dp/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["kimoto"]
  gem.email         = ["sub+peerler@gmail.com"]
  gem.description   = %q{Amazon Description of Product page parser}
  gem.summary       = %q{Amazon Description of Product page parser}
  gem.homepage      = ""
  # The bundled LICENSE file is the MIT License.
  gem.license       = "MIT"

  # Package exactly what is tracked by git; $\ is nil by default, so split
  # falls back to splitting on whitespace.
  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "amazon_dp"
  gem.require_paths = ["lib"]
  gem.version       = AmazonDP::VERSION

  # lib/amazon_dp/fetcher.rb and lib/amazon_dp/parser.rb require these at load
  # time, so they must be installed together with the gem.
  gem.add_dependency "nokogiri"
  gem.add_dependency "mechanize"
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
# encoding: utf-8
# Author: kimoto
#
# Example: fetch a few amazon.co.jp product pages (given as full URLs or as a
# bare ASIN code), parse each one and print the resulting PageInfo.
require 'logger'
require 'amazon_dp'

# Send the library's progress messages to STDERR.
AmazonDP::Fetcher.logger = Logger.new(STDERR)
AmazonDP::Parser.logger = Logger.new(STDERR)

fetcher = AmazonDP::Fetcher.new
fetcher.adult_auth

parser = AmazonDP::Parser.new

[
  "http://www.amazon.co.jp/dp/4894714078",
  "http://www.amazon.co.jp/dp/B00A40XOU4/",
  "B00A3EXJP6"
].each do |url_or_asin|
  page_data = fetcher.fetch(url_or_asin)
  p parser.parse_html(page_data)
end
|
23
|
+
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
# Author: kimoto
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'mechanize'
|
6
|
+
require 'logger'
|
7
|
+
|
8
|
+
module AmazonDP
  # Downloads Amazon product pages through a Mechanize agent.
  #
  # A page can be requested either by full URL or by ASIN code; an ASIN code
  # is expanded into a URL using the permalink template.
  class Fetcher
    # URL requested by #adult_auth.
    # NOTE(review): presumably visiting it stores an age-confirmation cookie
    # in the agent -- confirm against the site's behaviour.
    ADULT_AUTH_URL = "http://www.amazon.co.jp/gp/product/black-curtain-redirect.html"
    # Permalink template; the literal "[ASIN_CODE]" is replaced by the ASIN.
    PERMALINK_URL = "http://www.amazon.co.jp/gp/product/[ASIN_CODE]?ie=UTF8&redirect=true"
    # Default User-Agent header sent by the agent.
    USER_AGENT = "w3m/0.5.2"

    # Logger shared by all Fetcher instances; Logger.new(nil) discards output.
    @@logger = Logger.new(nil)

    # Replaces the logger used by every Fetcher instance.
    def self.logger=(logger)
      @@logger = logger
    end

    # opts - Hash of optional overrides:
    #        :user_agent     - User-Agent string (default USER_AGENT)
    #        :adult_auth_url - URL used by #adult_auth (default ADULT_AUTH_URL)
    #        :permalink_url  - template containing "[ASIN_CODE]"
    #                          (default PERMALINK_URL)
    def initialize(opts={})
      @user_agent = opts[:user_agent] || USER_AGENT
      @adult_auth_url = opts[:adult_auth_url] || ADULT_AUTH_URL
      @permalink_url = opts[:permalink_url] || PERMALINK_URL
      @agent = Mechanize.new{ |agent|
        agent.user_agent = @user_agent
      }
    end

    # Requests the adult-authentication URL with redirect following switched
    # off, then switches it back on.
    #
    # The flag is restored in an ensure block so that a failed request does not
    # leave the agent with redirects disabled for later #fetch calls.
    def adult_auth
      @@logger.info "try adult authentication"
      @agent.redirect_ok = false
      begin
        @agent.get(@adult_auth_url)
      ensure
        @agent.redirect_ok = true
      end
      @@logger.info "adult authentication done"
    end

    # Fetches a product page and returns its body.
    #
    # params - a single argument: either a full "http://" / "https://" URL or
    #          an ASIN code.
    #
    # The first argument is what decides the route. (Comparing the whole
    # argument Array with a Regexp is never true, which would send every URL
    # down the ASIN path.)
    def fetch(*params)
      if params.first.to_s =~ %r{\Ahttps?://}
        fetch_url(*params)
      else
        fetch_asin_code(*params)
      end
    end

    private

    # Expands the ASIN code into the permalink template and fetches that URL.
    def fetch_asin_code(asin_code)
      fetch_url @permalink_url.gsub("[ASIN_CODE]", asin_code)
    end

    # GETs +url+ with the agent and returns the response body.
    def fetch_url(url)
      @@logger.info "try fetch information: #{url}"
      page = @agent.get(url)
      @@logger.info "fetched"
      page.body
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'logger'
|
5
|
+
|
6
|
+
module AmazonDP
  # Base class for the errors defined by this library.
  class AmazonDPError < StandardError; end

  # Value object holding the fields extracted from one product page.
  class PageInfo
    attr_accessor :is_kindle     # result of Parser#is_kindle_product_page?
    attr_accessor :is_adult      # result of Parser#is_adult_product_page?
    attr_accessor :kindle_price  # Float, or nil when the page is not a Kindle page
    attr_accessor :kindle_pages  # Integer, or nil when the page is not a Kindle page
    attr_accessor :stars         # Float, or nil when no rating could be read
    attr_accessor :reviews       # Integer review count
    attr_accessor :iine          # Integer count read from the "like" counter

    # opts - Hash mapping attribute names to initial values. Every key must
    #        name one of the accessors above; an unknown key raises
    #        NoMethodError.
    def initialize(opts={})
      opts.each do |name, value|
        # public_send keeps a hash key from reaching a private writer.
        public_send("#{name}=", value)
      end
    end
  end

  # Extracts a PageInfo from the HTML of an Amazon product page.
  class Parser
    # Base class for parse failures.
    class ParseError < AmazonDPError; end
    # Raised by #parse_html when the HTML is the adult-content notice page
    # rather than a product page.
    class Amazon18xError < ParseError; end

    # Logger shared by all Parser instances; Logger.new(nil) discards output.
    @@logger = Logger.new(nil)

    # Replaces the logger used by every Parser instance.
    def self.logger=(logger)
      @@logger = logger
    end

    # True when the document contains at least one "span.alert" element.
    def is_adult_notice_page?(doc)
      !doc.search("span.alert").empty?
    end

    # True when the last child of the first "span.highlight" element reads
    # "[アダルト]". Any error while walking the document (for example, no such
    # element) yields false.
    def is_adult_product_page?(doc)
      doc.search("span.highlight").first.children.last.text.strip == "[アダルト]"
    rescue
      false
    end

    # True when the last child of the first matching list item reads
    # "Kindle版". Any error while walking the document yields false.
    def is_kindle_product_page?(doc)
      doc.search("table > tr > td > div.content > ul > li").first.children.last.text.strip == "Kindle版"
    rescue
      false
    end

    # Returns the trailing number of the ".priceLarge" text as a Float
    # (0.0 when there is none).
    def extract_kindle_price(doc)
      # Thousands separators are dropped first; otherwise "1,200" would be
      # read as 200.
      doc.search(".priceLarge").text.strip.delete(",").match(/[\d.]+$/).to_s.to_f
    end

    # Returns the leading number of the page-count text as an Integer
    # (0 when there is none).
    def extract_kindle_pages(doc)
      doc.search("li.listItem > a#pageCountAvailable > span").text.strip.delete(",").match(/^[\d.]+/).to_s.to_i
    end

    # Returns the trailing number of the star sprite's title attribute as a
    # Float, or nil when it cannot be read.
    def extract_stars(doc)
      doc.search("span.crAvgStars > span.asinReviewsSummary > a > span.swSprite")[0].attributes["title"].value.match(/[\d.]+$/).to_s.to_f
    rescue
      nil
    end

    # Returns the leading number of the first "div.tiny > b" child as an
    # Integer (0 when the element is missing), or nil on error.
    def extract_reviews(doc)
      # Commas are stripped, as in #extract_iine, so "1,234" is read as 1234.
      doc.search("div.tiny > b").children.first.to_s.delete(",").match(/^[\d.]+/).to_s.to_i
    rescue
      nil
    end

    # Returns the like counter as an Integer with thousands separators
    # removed (0 when the element is missing), or nil on error.
    def extract_iine(doc)
      doc.search("span.amazonLikeCountContainer > span").children.first.to_s.gsub(/,/, "").to_i
    rescue
      nil
    end

    # Parses the HTML of a product page.
    #
    # html_data - String containing the page's HTML.
    #
    # Returns a PageInfo. The Kindle price and page count are only extracted
    # when the page is recognised as a Kindle page; otherwise they are nil.
    # Raises Amazon18xError when the HTML is the adult-content notice page.
    def parse_html(html_data)
      @@logger.info "try to parse html data"
      # NOTE(review): the transcoded copy is discarded, so Nokogiri receives
      # html_data unchanged; as written this line can only matter by raising
      # an Encoding error. Confirm whether the result was meant to be parsed
      # before changing it.
      html_data.encode("UTF-8", "cp932")
      doc = Nokogiri::HTML(html_data)
      if is_adult_notice_page?(doc)
        @@logger.info "this page is adult amazon page"
        raise Amazon18xError
      end
      kindle_flag = is_kindle_product_page?(doc)
      PageInfo.new(
        :is_adult => is_adult_product_page?(doc),
        :is_kindle => kindle_flag,
        :kindle_price => (kindle_flag ? extract_kindle_price(doc) : nil),
        :kindle_pages => (kindle_flag ? extract_kindle_pages(doc) : nil),
        :stars => extract_stars(doc),
        :reviews => extract_reviews(doc),
        :iine => extract_iine(doc)
      )
    end
  end
end
|
data/lib/amazon_dp.rb
ADDED
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: amazon_dp
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- kimoto
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-12-05 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Amazon Description of Product page parser
|
15
|
+
email:
|
16
|
+
- sub+peerler@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- .gitignore
|
22
|
+
- Gemfile
|
23
|
+
- LICENSE
|
24
|
+
- README.md
|
25
|
+
- Rakefile
|
26
|
+
- amazon_dp.gemspec
|
27
|
+
- examples/fetch_and_parse.rb
|
28
|
+
- examples/quick_fetch.rb
|
29
|
+
- lib/amazon_dp.rb
|
30
|
+
- lib/amazon_dp/fetcher.rb
|
31
|
+
- lib/amazon_dp/parser.rb
|
32
|
+
- lib/amazon_dp/utils.rb
|
33
|
+
- lib/amazon_dp/version.rb
|
34
|
+
homepage: ''
|
35
|
+
licenses: []
|
36
|
+
post_install_message:
|
37
|
+
rdoc_options: []
|
38
|
+
require_paths:
|
39
|
+
- lib
|
40
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
47
|
+
none: false
|
48
|
+
requirements:
|
49
|
+
- - ! '>='
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '0'
|
52
|
+
requirements: []
|
53
|
+
rubyforge_project:
|
54
|
+
rubygems_version: 1.8.24
|
55
|
+
signing_key:
|
56
|
+
specification_version: 3
|
57
|
+
summary: Amazon Description of Product page parser
|
58
|
+
test_files: []
|
59
|
+
has_rdoc:
|