amajp_crawl 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/amajp_crawl.rb +121 -0
- metadata +43 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6911c11b2c19a7251e2ffce7548debf321b2b9376dfc3dcf6cd33188f24c49f3
|
4
|
+
data.tar.gz: 4c11bec39d6eaff44cbece13bd34cc7021e4c87fd5c3f9990b65c5bd30f1e927
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b9c50a5a976072a3f6ef141db547767f5d7836c98f79238b8c5538b3b81188e8cb04bfc1ab611ba9ae2447aee2fc0659b1611db1804ed01754629c18f7d69f35
|
7
|
+
data.tar.gz: ff1edeb870edd2a4db9bc9177c553237e822dc5672bcdb354df73fe25a6d7a29d773e567c0c74bd1b1e5ecebd2f7af0aefc2ce170490fe7136a157111ea09f22
|
data/lib/amajp_crawl.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'csv'
|
4
|
+
require 'thread'
|
5
|
+
require 'pry'
|
6
|
+
|
7
|
+
# Console banner texts used by the crawler.
#
# Each method returns a fixed string and performs no I/O itself; callers
# decide whether to `puts` or `print` it. Classes that need these texts
# inherit from Notice.
class Notice
  # Prompt asking the user for the search keyword.
  def insert_keyword
    "= = = = = = = = = = INSERT KEYWORD = = = = = = = = = = = = = = = ="
  end

  # Prompt asking how many result pages to crawl (text states a maximum of 21).
  def insert_last_page
    "= = = = = = = INSERT LAST PAGE = = = MAX IS 21 PAGE = = = = = = = "
  end

  # Banner for a missing keyword.
  def nil_keyword
    "= = = = = = = = = = KEYWORD NILL = = = = = = = = = = = = = = = = ="
  end

  # Banner for an unusable page count.
  def not_availability_last_page
    "= = = = = = = = = = LAST PAGE NOT AVAILABILITY = = = = = = = = = ="
  end

  # Banner shown once ASIN collection has finished.
  def complete_get_asin
    "= = = = = = = = = = COMPLETE GET DATA ASIN = = = = = = = = = = = ="
  end

  # Banner announcing the CSV export step.
  def insert_to_csv
    "= = = = = = = = = = INSERT TO CSV FILE = = = = = = = = = = = = = ="
  end

  # Banner shown when the whole run has finished.
  def complete
    "= = = = = = = = = = COMPLETE = = = = = = = = = = = = = = = = = = ="
  end

  # One segment of the progress indicator.
  def loading
    "== "
  end

  # Banner shown when the run failed.
  def error
    "= = = = = = = = = = ERROR = = = TRY AGAIN LATER = = = = = = = = = "
  end
end
|
45
|
+
|
46
|
+
# Collects the crawl parameters interactively.
#
# Constructing an instance prints two prompts and reads two lines of input:
# the search keyword (kept as typed, minus the line terminator) and the
# number of result pages to crawl.
class InsertData < Notice
  # Upper bound announced by the page-count prompt ("MAX IS 21 PAGE").
  MAX_LAST_PAGE = 21

  attr_accessor :keyword, :last_page

  def initialize
    puts insert_keyword
    @keyword = read_answer
    puts nil_keyword if @keyword.empty?

    puts insert_last_page
    requested = read_answer.to_i
    # Non-numeric input becomes 0 via to_i; keep the stored value inside
    # 0..MAX_LAST_PAGE so no pages beyond the announced maximum are requested.
    @last_page = [[requested, 0].max, MAX_LAST_PAGE].min
    puts not_availability_last_page unless requested.between?(1, MAX_LAST_PAGE)
  end

  private

  # Reads one line of input without its line terminator.
  # `gets` returns nil at end of input; `to_s` turns that into "" instead of
  # letting `chomp` raise NoMethodError.
  def read_answer
    gets.to_s.chomp
  end
end
|
56
|
+
|
57
|
+
|
58
|
+
# Builds the list of Amazon.co.jp search-result URLs to crawl and resets the
# global crawl state.
#
# Globals written by the constructor:
#   $retries    - remaining attempts, set to 3
#   $completed  - success flag, set to true
#   $total_asin - collected ASINs, set to an empty array
class GetdataAsin
  attr_accessor :urls

  # @param insert_data [#keyword, #last_page] search keyword and number of
  #   result pages; one URL is generated for each page from 1 to last_page
  #   (none when last_page is zero or negative).
  def initialize(insert_data)
    $retries = 3
    $completed = true
    $total_asin = []

    # Percent-encode the keyword: raw spaces or non-ASCII text (e.g. Japanese)
    # would otherwise yield a string that is not a valid URI.
    keyword = URI.encode_www_form_component(insert_data.keyword.to_s)

    @urls = (1..insert_data.last_page).map do |page|
      "https://www.amazon.co.jp/s?__mk_ja_JP=%E3%82%AB%E3%82%BF%E3%82%AB%E3%83%8A&i=aps&k=#{keyword}&page=#{page}&ref=nb_sb_noss&url=search-alias%3Daps"
    end
  end
end
|
69
|
+
|
70
|
+
# Script entry: ask the user for the crawl parameters, then build the URL list.
insert_data = InsertData.new
get_data = GetdataAsin.new(insert_data)

# Downloads every search-result page and appends the ASINs found to the
# global $total_asin array.
class Parser < Notice
  # CSS path to the individual result entries on a search page.
  RESULT_ITEM_SELECTOR = "#a-page > #search > .sg-row > .sg-col > .sg-col-inner > .rush-component > .s-result-list .s-result-item"

  # @param insert_data [#urls] object exposing the page URLs to fetch
  #   (the script passes its GetdataAsin instance).
  # Any download or parse error propagates to the caller.
  def initialize(insert_data)
    insert_data.urls.each do |url|
      $total_asin.concat(fetch_asins(url))
      print loading
    end
    puts complete_get_asin
    puts insert_to_csv
  end

  private

  # Fetches one result page and returns its non-empty `data-asin` values.
  def fetch_asins(url)
    # URI#open (from open-uri) replaces Kernel#open(url), which no longer
    # accepts URLs on Ruby 3.0+; the block form closes the handle afterwards.
    page = URI.parse(url).open { |io| Nokogiri::HTML(io) }
    page.css(RESULT_ITEM_SELECTOR)
        .map { |item| item.attr("data-asin") }
        .reject { |asin| asin.nil? || asin.empty? }
  end
end

begin
  # Each attempt starts from page 1, so drop what a failed attempt collected;
  # otherwise those ASINs would be recorded twice.
  $total_asin = []
  Parser.new(get_data)
rescue => exception
  puts exception
  $retries -= 1
  if $retries > 0
    # Only wait when another attempt will actually follow.
    puts "Wait for 3 minutes and Retry"
    sleep 180 # 3 minutes
    retry
  end
  $completed = false
  puts "#{exception}, TRY AGAIN LATER"
end
|
107
|
+
|
108
|
+
# Writes the collected ASINs (global $total_asin) to
# "<keyword>_asin_amazonjp.csv" in the current directory, one ASIN per row,
# then reports the outcome on the console.
class InsertCsv < Notice
  # @param insert_data [#keyword] supplies the keyword used in the file name
  #   and in the final message.
  # The file is written even when the crawl failed, so partial results are
  # kept; in that case only the error banner is printed.
  def initialize(insert_data)
    # Path separators in the keyword would point outside the current
    # directory or at a missing one; replace them for the file name only.
    file_name = "#{insert_data.keyword.to_s.gsub(%r{[/\\]}, '_')}_asin_amazonjp.csv"

    CSV.open(file_name, 'wb') do |csv|
      # CSV appends the row terminator itself; a row is just the ASIN.
      $total_asin.each { |asin| csv << [asin] }
    end

    return puts error if $completed == false

    puts complete
    puts "CHECK #{insert_data.keyword} ASIN AT #{File.join(Dir.pwd, file_name)}"
  end
end

InsertCsv.new(insert_data)
|
metadata
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: amajp_crawl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Le Hoang Lan
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-07-07 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email: lehoanglan97k31@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/amajp_crawl.rb
|
20
|
+
homepage: http://rubygems.org/gems/amajp_crawl
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubygems_version: 3.0.3
|
40
|
+
signing_key:
|
41
|
+
specification_version: 4
|
42
|
+
summary: Crawl AsinID any production from Amazonjp via Terminal
|
43
|
+
test_files: []
|