tomosia_amanaplus_crawl 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -9
- data/lib/tomosia_amanaplus_crawl.rb +7 -8
- data/lib/tomosia_amanaplus_crawl/cli.rb +1 -1
- data/lib/tomosia_amanaplus_crawl/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78adb558a362c9594df8a864dad75bf4bc95472eb64e2aa9ef1499d2a39f6837
|
4
|
+
data.tar.gz: d2d670dbc023aa1f1ce265f78245b18a2177da373b0e4436a8a88ced97f9677b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 382535d1072a6803ffd0166ee70b99187514b9119656e94c852519ef3c608ed6d829a86d19d4d8a20027a6fc1e1a3fe909defc4644e139002998751a40bb124c
|
7
|
+
data.tar.gz: d18803def75f4efa16e4e4339e3f0b1e64a6a54f77c4a8a6b45d5cb8991e6a3e5a87a0c4d1047c22ad8fdab4266c1c3dedac07660663aeb6557624266e6c6809
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tomosia_amanaplus_crawl (0.
|
5
|
-
httparty (= 0.18.1)
|
4
|
+
tomosia_amanaplus_crawl (0.2.3)
|
6
5
|
nokogiri (= 1.10.10)
|
7
6
|
spreadsheet (= 1.2.6)
|
8
7
|
thor
|
@@ -11,14 +10,7 @@ GEM
|
|
11
10
|
remote: https://rubygems.org/
|
12
11
|
specs:
|
13
12
|
diff-lcs (1.4.4)
|
14
|
-
httparty (0.18.1)
|
15
|
-
mime-types (~> 3.0)
|
16
|
-
multi_xml (>= 0.5.2)
|
17
|
-
mime-types (3.3.1)
|
18
|
-
mime-types-data (~> 3.2015)
|
19
|
-
mime-types-data (3.2020.0512)
|
20
13
|
mini_portile2 (2.4.0)
|
21
|
-
multi_xml (0.6.0)
|
22
14
|
nokogiri (1.10.10)
|
23
15
|
mini_portile2 (~> 2.4.0)
|
24
16
|
rake (12.3.3)
|
data/lib/tomosia_amanaplus_crawl.rb
CHANGED
@@ -15,11 +15,11 @@ module TomosiaAmanaplusCrawl
|
|
15
15
|
|
16
16
|
pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
|
17
17
|
images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
|
18
|
-
|
18
|
+
|
19
19
|
# lấy tổng số image
|
20
20
|
total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
|
21
21
|
total = total[(6 + keyword.length)..(total.length - 1)].chop.chop.chop.gsub(',', '').to_i
|
22
|
-
if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
22
|
+
if max == nil || max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
23
23
|
max = total
|
24
24
|
end
|
25
25
|
|
@@ -34,11 +34,11 @@ module TomosiaAmanaplusCrawl
|
|
34
34
|
curr_index = 1
|
35
35
|
while curr_page <= pages
|
36
36
|
puts "Crawling page #{curr_page}..........."
|
37
|
-
|
37
|
+
|
38
38
|
pagination_unparsed_page = open("#{URL}/#{keyword}?page=#{curr_page}").read
|
39
39
|
pagination_parsed_page = Nokogiri::HTML(pagination_unparsed_page)
|
40
40
|
pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
|
41
|
-
|
41
|
+
|
42
42
|
pagination_images_listings.each do |img|
|
43
43
|
if curr_index > max
|
44
44
|
return images
|
@@ -54,7 +54,7 @@ module TomosiaAmanaplusCrawl
|
|
54
54
|
images << current_image
|
55
55
|
curr_index += 1
|
56
56
|
end
|
57
|
-
|
57
|
+
|
58
58
|
curr_page += 1
|
59
59
|
end
|
60
60
|
images
|
@@ -75,7 +75,7 @@ module TomosiaAmanaplusCrawl
|
|
75
75
|
File.open("#{path}/#{curr_image[:url].split('/').last}", "a+") do |file|
|
76
76
|
file.write(image.read) # lưu hình ảnh
|
77
77
|
curr_image[:size] = image.size # cập nhật lại size trong mảng images
|
78
|
-
print
|
78
|
+
print '.'
|
79
79
|
end
|
80
80
|
end # end open
|
81
81
|
rescue => exception
|
@@ -99,10 +99,9 @@ module TomosiaAmanaplusCrawl
|
|
99
99
|
book = Spreadsheet::Workbook.new
|
100
100
|
sheet1 = book.create_worksheet
|
101
101
|
|
102
|
-
i = 0
|
103
102
|
sheet1.row(0).concat %w{Title Url Size(bytes) Extension}
|
104
103
|
puts "Writing..........."
|
105
|
-
images.each do |img|
|
104
|
+
images.each_with_index do |img, i|
|
106
105
|
sheet1.row(i += 1).push img[:title], img[:url], img[:size], img[:extension]
|
107
106
|
end
|
108
107
|
puts "Writed."
|
data/lib/tomosia_amanaplus_crawl/cli.rb
CHANGED
@@ -8,7 +8,7 @@ module TomosiaAmanaplusCrawl
|
|
8
8
|
option :destination
|
9
9
|
option :max
|
10
10
|
def crawl(keyword)
|
11
|
-
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, options[:destination], options[:max].to_i)
|
11
|
+
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, options[:destination], options[:max] == nil ? nil : options[:max].to_i)
|
12
12
|
end
|
13
13
|
end
|
14
14
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_amanaplus_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nhat Huy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|