amajp_crawl 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/amajp_crawl.rb +121 -0
  3. metadata +43 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 6911c11b2c19a7251e2ffce7548debf321b2b9376dfc3dcf6cd33188f24c49f3
4
+ data.tar.gz: 4c11bec39d6eaff44cbece13bd34cc7021e4c87fd5c3f9990b65c5bd30f1e927
5
+ SHA512:
6
+ metadata.gz: b9c50a5a976072a3f6ef141db547767f5d7836c98f79238b8c5538b3b81188e8cb04bfc1ab611ba9ae2447aee2fc0659b1611db1804ed01754629c18f7d69f35
7
+ data.tar.gz: ff1edeb870edd2a4db9bc9177c553237e822dc5672bcdb354df73fe25a6d7a29d773e567c0c74bd1b1e5ecebd2f7af0aefc2ce170490fe7136a157111ea09f22
@@ -0,0 +1,121 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'csv'
4
+ require 'thread'
5
+ require 'pry'
6
+
7
# Console banners shared by the crawler's steps.
#
# Every banner is exposed as a zero-argument instance method, so subclasses
# can write `puts insert_keyword`, `print loading`, and so on.
class Notice
  # Banner text keyed by the name of the instance method that returns it.
  BANNERS = {
    insert_keyword: "= = = = = = = = = = INSERT KEYWORD = = = = = = = = = = = = = = = =",
    insert_last_page: "= = = = = = = INSERT LAST PAGE = = = MAX IS 21 PAGE = = = = = = = ",
    nil_keyword: "= = = = = = = = = = KEYWORD NILL = = = = = = = = = = = = = = = = =",
    not_availability_last_page: "= = = = = = = = = = LAST PAGE NOT AVAILABILITY = = = = = = = = = =",
    complete_get_asin: "= = = = = = = = = = COMPLETE GET DATA ASIN = = = = = = = = = = = =",
    insert_to_csv: "= = = = = = = = = = INSERT TO CSV FILE = = = = = = = = = = = = = =",
    complete: "= = = = = = = = = = COMPLETE = = = = = = = = = = = = = = = = = = =",
    loading: "== ",
    error: "= = = = = = = = = = ERROR = = = TRY AGAIN LATER = = = = = = = = = "
  }.freeze

  # Define one reader per banner. A copy is returned so each call hands the
  # caller its own unfrozen String.
  BANNERS.each do |reader, text|
    define_method(reader) { text.dup }
  end
end
45
+
46
# Prompts on the terminal for the search keyword and the number of result
# pages to crawl, and stores both answers.
class InsertData < Notice
  # keyword   - the search term exactly as typed (String, may be empty).
  # last_page - number of result pages requested (Integer; 0 when the answer
  #             is not numeric, because String#to_i is used).
  attr_accessor :keyword, :last_page

  # Prints each prompt banner and reads the matching answer from standard
  # input.
  def initialize
    puts insert_keyword
    @keyword = read_answer
    puts insert_last_page
    @last_page = read_answer.to_i
  end

  private

  # Reads one line from standard input without its trailing newline.
  #
  # $stdin is named explicitly because a bare Kernel#gets reads from the files
  # listed in ARGV when the script is started with arguments. `to_s` turns the
  # nil returned at end-of-input into "" instead of raising NoMethodError.
  def read_answer
    $stdin.gets.to_s.chomp
  end
end
56
+
57
+
58
# Builds the list of Amazon.co.jp search-result URLs to crawl and resets the
# script-wide crawl state held in globals.
class GetdataAsin
  # urls - one search URL per requested result page, in page order.
  attr_accessor :urls

  # insert_data - any object responding to `keyword` (String) and `last_page`
  #               (Integer); a non-positive `last_page` yields no URLs.
  #
  # Side effects: sets $retries to 3, $completed to true and $total_asin to [].
  def initialize(insert_data)
    $retries = 3
    $completed = true
    $total_asin = []

    # Escape the keyword so spaces, "&" and non-ASCII (e.g. Japanese) text
    # produce a valid query string instead of a malformed URL.
    keyword = URI.encode_www_form_component(insert_data.keyword)
    @urls = (1..insert_data.last_page).map do |page|
      "https://www.amazon.co.jp/s?__mk_ja_JP=%E3%82%AB%E3%82%BF%E3%82%AB%E3%83%8A&i=aps&k=#{keyword}&page=#{page}&ref=nb_sb_noss&url=search-alias%3Daps"
    end
  end
end
69
+
70
# Ask the user what to search for, then prepare the page URLs and the global
# crawl state ($retries, $completed, $total_asin).
insert_data = InsertData.new
get_data = GetdataAsin.new(insert_data)

# Downloads every search-result page and collects the `data-asin` attribute of
# each result item into the global $total_asin.
#
# Defined outside the begin/rescue below so the class body is not re-evaluated
# each time the crawl is retried.
class Parser < Notice
  # CSS path to the result items on a search page.
  RESULT_ITEMS = "#a-page > #search > .sg-row > .sg-col > .sg-col-inner > .rush-component > .s-result-list .s-result-item".freeze

  # insert_data - despite its name, the object supplying `urls` (the script
  #               passes the GetdataAsin instance).
  #
  # Prints a progress marker after each page and two banners when done.
  # Errors raised while downloading or parsing are not rescued here; they
  # propagate to the caller.
  def initialize(insert_data)
    # Start from an empty list so a retried crawl does not append ASINs that
    # an earlier, failed attempt already collected.
    $total_asin = []
    insert_data.urls.each do |url|
      # URI.open rather than Kernel#open: open-uri no longer patches
      # Kernel#open to accept URLs on Ruby 3.0 and later.
      raw_page = Nokogiri::HTML(URI.open(url))
      asins = raw_page.css(RESULT_ITEMS).map { |item| item.attr("data-asin") }
      # Items without a data-asin attribute yield nil and are skipped.
      $total_asin.concat(asins.compact)
      print loading
    end
    puts complete_get_asin
    puts insert_to_csv
  end
end

# Run the crawl, making up to three attempts in total ($retries starts at 3).
begin
  Parser.new(get_data)
rescue => exception
  puts exception
  $retries -= 1
  if $retries > 0
    # Only wait when another attempt will actually follow.
    puts "Wait for 3 minutes and Retry"
    sleep 180 # 3 minutes
    retry
  end
  # Out of attempts: flag the failure so the CSV step reports an error.
  $completed = false
  puts "#{exception}, TRY AGAIN LATER"
end
107
+
108
# Writes the collected ASINs to "<keyword>_asin_amazonjp.csv" in the current
# working directory and reports the outcome on the terminal.
class InsertCsv < Notice
  # insert_data - object responding to `keyword`; the keyword is used verbatim
  #               as the file-name prefix.
  #
  # Reads the globals $total_asin (rows to write) and $completed (whether the
  # crawl finished). The file is written even when the crawl failed; in that
  # case only the error banner is printed afterwards.
  def initialize(insert_data)
    file_name = "#{insert_data.keyword}_asin_amazonjp.csv"
    CSV.open(file_name, 'wb') do |csv|
      # One ASIN per row. CSV appends the row separator itself, so no "\n"
      # field is added (that produced a second, quoted newline column).
      $total_asin.each { |asin| csv << [asin] }
    end
    return puts error unless $completed

    puts complete
    puts "CHECK #{insert_data.keyword} ASIN AT #{File.join(Dir.pwd, file_name)}"
  end
end

# Final step of the script: persist whatever the crawl collected.
InsertCsv.new(insert_data)
metadata ADDED
@@ -0,0 +1,43 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: amajp_crawl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Le Hoang Lan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-07-07 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: lehoanglan97k31@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/amajp_crawl.rb
20
+ homepage: http://rubygems.org/gems/amajp_crawl
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubygems_version: 3.0.3
40
+ signing_key:
41
+ specification_version: 4
42
+ summary: Crawl AsinID any production from Amazonjp via Terminal
43
+ test_files: []