amazon_dp 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in amazon_dp.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 kimoto
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # AmazonDP
2
+
3
+ Amazon Description of Product page parser
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'amazon_dp'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install amazon_dp
18
+
19
+ ## Usage
20
+
21
+ #1
22
+ require 'amazon_dp'
23
+ page_info = AmazonDP.get("http://www.amazon.co.jp/dp/ASIN_CODE")
24
+
25
+ #2
26
+ require 'amazon_dp'
27
+ page_info = AmazonDP.get("ASIN_CODE")
28
+
29
+ #3
30
+ require 'amazon_dp'
31
+ f = AmazonDP::Fetcher.new
32
+ f.adult_auth
33
+ html = f.fetch("URL_OR_ASIN_CODE")
34
+ page_info = AmazonDP::Parser.new.parse_html(html)
35
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
data/amazon_dp.gemspec ADDED
@@ -0,0 +1,17 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/amazon_dp/version', __FILE__)
3
+
4
# Gem specification for amazon_dp.
Gem::Specification.new do |gem|
  gem.name          = "amazon_dp"
  gem.version       = AmazonDP::VERSION
  gem.authors       = ["kimoto"]
  gem.email         = ["sub+peerler@gmail.com"]
  gem.description   = %q{Amazon Description of Product page parser}
  gem.summary       = %q{Amazon Description of Product page parser}
  gem.homepage      = ""

  # `git ls-files` prints one path per line. Split on "\n" explicitly:
  # `$\` (the output record separator) is nil unless someone sets it, and
  # splitting on nil splits on any whitespace, which would break paths
  # that contain spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # The library requires both of these at load time (fetcher.rb requires
  # nokogiri and mechanize, parser.rb requires nokogiri), so they must be
  # declared or `gem install amazon_dp` yields a gem that cannot be loaded.
  gem.add_dependency "nokogiri"
  gem.add_dependency "mechanize"
end
@@ -0,0 +1,23 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+ # Author: kimoto
4
+ require 'amazon_dp'
5
+
6
# Route the library's log output to STDERR for both components.
AmazonDP::Fetcher.logger = Logger.new(STDERR)
AmazonDP::Parser.logger = Logger.new(STDERR)

# One authenticated fetcher and one parser are reused for every target.
fetcher = AmazonDP::Fetcher.new
fetcher.adult_auth

parser = AmazonDP::Parser.new

# Targets may be full product URLs or bare ASIN codes.
targets = [
  "http://www.amazon.co.jp/dp/4894714078",
  "http://www.amazon.co.jp/dp/B00A40XOU4/",
  "B00A3EXJP6"
]

targets.each do |target|
  html = fetcher.fetch(target)
  p parser.parse_html(html)
end
23
+
@@ -0,0 +1,8 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+ # Author: kimoto
4
+ require 'amazon_dp'
5
+
6
# Fetch and parse a single product by ASIN code, then dump the result.
p AmazonDP.get("B00A3EXJP6")
8
+
@@ -0,0 +1,56 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+ # Author: kimoto
4
+ require 'nokogiri'
5
+ require 'mechanize'
6
+ require 'logger'
7
+
8
module AmazonDP
  # Downloads Amazon product pages through a Mechanize agent.
  #
  # A target is given either as an absolute http(s) URL, which is fetched
  # as-is, or as a bare ASIN code, which is substituted for the
  # "[ASIN_CODE]" placeholder in the permalink template.
  class Fetcher
    ADULT_AUTH_URL = "http://www.amazon.co.jp/gp/product/black-curtain-redirect.html"
    PERMALINK_URL = "http://www.amazon.co.jp/gp/product/[ASIN_CODE]?ie=UTF8&redirect=true"
    USER_AGENT = "w3m/0.5.2"

    # Recognises targets that are already absolute http(s) URLs. The
    # "://" is required so that an ASIN that merely starts with the
    # letters "http" is not mistaken for a URL.
    URL_PATTERN = %r{\Ahttps?://}i

    # Shared by every Fetcher instance; discards all output by default.
    @@logger = Logger.new(nil)

    # Replaces the logger used by all Fetcher instances.
    #
    # @param logger [Logger]
    def self.logger=(logger)
      @@logger = logger
    end

    # @param opts [Hash] optional overrides
    # @option opts [String] :user_agent     User-Agent header (default USER_AGENT)
    # @option opts [String] :adult_auth_url URL requested by #adult_auth
    # @option opts [String] :permalink_url  template containing "[ASIN_CODE]"
    def initialize(opts={})
      @user_agent = opts[:user_agent] || USER_AGENT
      @adult_auth_url = opts[:adult_auth_url] || ADULT_AUTH_URL
      @permalink_url = opts[:permalink_url] || PERMALINK_URL
      @agent = Mechanize.new { |agent| agent.user_agent = @user_agent }
    end

    # Requests the adult-authentication URL with redirect following
    # switched off, then switches redirect following back on.
    # NOTE(review): presumably the request leaves an age-confirmation
    # cookie in the agent's session -- confirm against Amazon's behaviour.
    #
    # Redirect following is re-enabled even when the request raises, so a
    # failed authentication attempt does not leave the agent unable to
    # follow redirects on later fetches.
    def adult_auth
      @@logger.info "try adult authentication"
      @agent.redirect_ok = false
      begin
        @agent.get(@adult_auth_url)
      ensure
        @agent.redirect_ok = true
      end
      @@logger.info "adult authentication done"
    end

    # Fetches a product page and returns its body.
    #
    # @param params [Array<String>] a single URL or ASIN code
    # @return [String] the response body
    def fetch(*params)
      # The first argument decides the route. The previous check compared
      # the whole argument array with a Regexp using ==, which is never
      # true, so URLs were wrongly spliced into the permalink template.
      if params.first.to_s =~ URL_PATTERN
        fetch_url(*params)
      else
        fetch_asin_code(*params)
      end
    end

    private

    # Builds the permalink for +asin_code+ and fetches it.
    def fetch_asin_code(asin_code)
      # gsub returns a new string, so the template itself is never mutated.
      fetch_url @permalink_url.gsub("[ASIN_CODE]", asin_code)
    end

    # Performs the GET request and returns the raw response body.
    def fetch_url(url)
      @@logger.info "try fetch information: #{url}"
      page = @agent.get(url)
      @@logger.info "fetched"
      page.body
    end
  end
end
@@ -0,0 +1,95 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+ require 'nokogiri'
4
+ require 'logger'
5
+
6
module AmazonDP
  # Base class for every error raised by this library.
  class AmazonDPError < StandardError; end

  # Plain value holder for the facts extracted from one product page.
  class PageInfo
    attr_accessor :is_kindle    # true when the page was detected as a Kindle edition
    attr_accessor :is_adult     # true when the page carries the adult marker
    attr_accessor :kindle_price # Float price, nil for non-Kindle pages
    attr_accessor :kindle_pages # Integer page count, nil for non-Kindle pages
    attr_accessor :stars        # Float taken from the review-stars title, or nil
    attr_accessor :reviews      # Integer review count, or nil
    attr_accessor :iine         # Integer "like" count, or nil

    # @param opts [Hash] attribute name => value; every key must name one
    #   of the accessors above (an unknown key raises NoMethodError)
    def initialize(opts={})
      opts.each { |key, value| send("#{key}=", value) }
    end
  end

  # Extracts a PageInfo from the HTML of an Amazon product page.
  class Parser
    class ParseError < AmazonDPError; end
    # Raised when the HTML is the adult-content notice page, not a product page.
    class Amazon18xError < ParseError; end

    # Shared by every Parser instance; discards all output by default.
    @@logger = Logger.new(nil)

    # Replaces the logger used by all Parser instances.
    #
    # @param logger [Logger]
    def self.logger=(logger)
      @@logger = logger
    end

    # @return [Boolean] true when the document contains a "span.alert" element
    def is_adult_notice_page?(doc)
      !doc.search("span.alert").empty?
    end

    # @return [Boolean] true when the last child of the first
    #   "span.highlight" element reads "[アダルト]"; false when that element
    #   is missing (best effort: any lookup failure counts as "not adult")
    def is_adult_product_page?(doc)
      doc.search("span.highlight").first.children.last.text.strip == "[アダルト]"
    rescue
      false
    end

    # @return [Boolean] true when the first matching detail list item ends
    #   with "Kindle版"; false when the item is missing (best effort)
    def is_kindle_product_page?(doc)
      doc.search("table > tr > td > div.content > ul > li").first.children.last.text.strip == "Kindle版"
    rescue
      false
    end

    # @return [Float] the number at the end of the ".priceLarge" text,
    #   0.0 when no number is found
    def extract_kindle_price(doc)
      price_text = without_separators(doc.search(".priceLarge").text.strip)
      price_text.match(/[\d.]+$/).to_s.to_f
    end

    # @return [Integer] the number at the start of the page-count text,
    #   0 when no number is found
    def extract_kindle_pages(doc)
      pages_text = without_separators(doc.search("li.listItem > a#pageCountAvailable > span").text.strip)
      pages_text.match(/^[\d.]+/).to_s.to_i
    end

    # @return [Float, nil] the number at the end of the star sprite's
    #   title attribute, nil when the element cannot be read
    def extract_stars(doc)
      sprite = doc.search("span.crAvgStars > span.asinReviewsSummary > a > span.swSprite")[0]
      sprite.attributes["title"].value.match(/[\d.]+$/).to_s.to_f
    rescue
      nil
    end

    # @return [Integer, nil] the number at the start of the first
    #   "div.tiny > b" child, nil when the lookup fails
    def extract_reviews(doc)
      reviews_text = without_separators(doc.search("div.tiny > b").children.first)
      reviews_text.match(/^[\d.]+/).to_s.to_i
    rescue
      nil
    end

    # @return [Integer, nil] the like count, nil when the lookup fails
    def extract_iine(doc)
      without_separators(doc.search("span.amazonLikeCountContainer > span").children.first).to_i
    rescue
      nil
    end

    # Parses the HTML of a product page.
    #
    # @param html_data [String] page markup
    # @return [PageInfo]
    # @raise [Amazon18xError] when the markup is the adult notice page
    def parse_html(html_data)
      @@logger.info "try to parse html data"
      # The markup is handed to Nokogiri untouched. An earlier
      # `html_data.encode("UTF-8", "cp932")` call here discarded its
      # result, so nothing was ever transcoded; its only effect was to
      # raise on byte sequences that are not valid CP932.
      doc = Nokogiri::HTML(html_data)
      if is_adult_notice_page?(doc)
        @@logger.info "this page is adult amazon page"
        raise Amazon18xError
      end

      kindle_flag = is_kindle_product_page?(doc)
      PageInfo.new(
        :is_adult => is_adult_product_page?(doc),
        :is_kindle => kindle_flag,
        :kindle_price => (kindle_flag ? extract_kindle_price(doc) : nil),
        :kindle_pages => (kindle_flag ? extract_kindle_pages(doc) : nil),
        :stars => extract_stars(doc),
        :reviews => extract_reviews(doc),
        :iine => extract_iine(doc)
      )
    end

    private

    # Drops thousands separators so that "1,200" is read as 1200 rather
    # than being cut off at the comma by the numeric patterns above.
    def without_separators(text)
      text.to_s.delete(",")
    end
  end
end
@@ -0,0 +1,10 @@
1
+ #!/bin/env ruby
2
+
3
module AmazonDP
  class << self
    # One-shot convenience: builds a fetcher, runs the adult
    # authentication step, downloads the page identified by
    # +url_or_asin+ and returns whatever Parser#parse_html produces.
    def get(url_or_asin)
      fetcher = Fetcher.new
      fetcher.adult_auth
      parser = Parser.new
      parser.parse_html(fetcher.fetch(url_or_asin))
    end
  end
end
10
+
@@ -0,0 +1,3 @@
1
module AmazonDP
  # Gem version string, read by amazon_dp.gemspec. Frozen so that no
  # caller can mutate the shared constant in place.
  VERSION = "0.0.1".freeze
end
data/lib/amazon_dp.rb ADDED
@@ -0,0 +1,7 @@
1
+ require "amazon_dp/version"
2
+ require 'amazon_dp/fetcher'
3
+ require 'amazon_dp/parser'
4
+ require 'amazon_dp/utils'
5
+
6
+ module AmazonDP
7
+ end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: amazon_dp
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - kimoto
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-05 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Amazon Description of Product page parser
15
+ email:
16
+ - sub+peerler@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - Gemfile
23
+ - LICENSE
24
+ - README.md
25
+ - Rakefile
26
+ - amazon_dp.gemspec
27
+ - examples/fetch_and_parse.rb
28
+ - examples/quick_fetch.rb
29
+ - lib/amazon_dp.rb
30
+ - lib/amazon_dp/fetcher.rb
31
+ - lib/amazon_dp/parser.rb
32
+ - lib/amazon_dp/utils.rb
33
+ - lib/amazon_dp/version.rb
34
+ homepage: ''
35
+ licenses: []
36
+ post_install_message:
37
+ rdoc_options: []
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ none: false
48
+ requirements:
49
+ - - ! '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ requirements: []
53
+ rubyforge_project:
54
+ rubygems_version: 1.8.24
55
+ signing_key:
56
+ specification_version: 3
57
+ summary: Amazon Description of Product page parser
58
+ test_files: []
59
+ has_rdoc: