tomosia_amanaplus_crawl 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -9
- data/lib/tomosia_amanaplus_crawl.rb +7 -8
- data/lib/tomosia_amanaplus_crawl/cli.rb +1 -1
- data/lib/tomosia_amanaplus_crawl/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78adb558a362c9594df8a864dad75bf4bc95472eb64e2aa9ef1499d2a39f6837
|
4
|
+
data.tar.gz: d2d670dbc023aa1f1ce265f78245b18a2177da373b0e4436a8a88ced97f9677b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 382535d1072a6803ffd0166ee70b99187514b9119656e94c852519ef3c608ed6d829a86d19d4d8a20027a6fc1e1a3fe909defc4644e139002998751a40bb124c
|
7
|
+
data.tar.gz: d18803def75f4efa16e4e4339e3f0b1e64a6a54f77c4a8a6b45d5cb8991e6a3e5a87a0c4d1047c22ad8fdab4266c1c3dedac07660663aeb6557624266e6c6809
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
tomosia_amanaplus_crawl (0.
|
5
|
-
httparty (= 0.18.1)
|
4
|
+
tomosia_amanaplus_crawl (0.2.3)
|
6
5
|
nokogiri (= 1.10.10)
|
7
6
|
spreadsheet (= 1.2.6)
|
8
7
|
thor
|
@@ -11,14 +10,7 @@ GEM
|
|
11
10
|
remote: https://rubygems.org/
|
12
11
|
specs:
|
13
12
|
diff-lcs (1.4.4)
|
14
|
-
httparty (0.18.1)
|
15
|
-
mime-types (~> 3.0)
|
16
|
-
multi_xml (>= 0.5.2)
|
17
|
-
mime-types (3.3.1)
|
18
|
-
mime-types-data (~> 3.2015)
|
19
|
-
mime-types-data (3.2020.0512)
|
20
13
|
mini_portile2 (2.4.0)
|
21
|
-
multi_xml (0.6.0)
|
22
14
|
nokogiri (1.10.10)
|
23
15
|
mini_portile2 (~> 2.4.0)
|
24
16
|
rake (12.3.3)
|
data/lib/tomosia_amanaplus_crawl.rb
CHANGED
@@ -15,11 +15,11 @@ module TomosiaAmanaplusCrawl
|
|
15
15
|
|
16
16
|
pages = parsed_page.css("div.c-paginate__nums").css('a').last.text.to_i # tổng số page
|
17
17
|
images_listings = parsed_page.css("div.p-search-result__body") # danh sách các thẻ div chứa image
|
18
|
-
|
18
|
+
|
19
19
|
# lấy tổng số image
|
20
20
|
total = parsed_page.css("h1.p-search-result__ttl").text.split(' ').first
|
21
21
|
total = total[(6 + keyword.length)..(total.length - 1)].chop.chop.chop.gsub(',', '').to_i
|
22
|
-
if max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
22
|
+
if max == nil || max > total # nếu max lớn hơn total thì max = total => vẫn lấy hết
|
23
23
|
max = total
|
24
24
|
end
|
25
25
|
|
@@ -34,11 +34,11 @@ module TomosiaAmanaplusCrawl
|
|
34
34
|
curr_index = 1
|
35
35
|
while curr_page <= pages
|
36
36
|
puts "Crawling page #{curr_page}..........."
|
37
|
-
|
37
|
+
|
38
38
|
pagination_unparsed_page = open("#{URL}/#{keyword}?page=#{curr_page}").read
|
39
39
|
pagination_parsed_page = Nokogiri::HTML(pagination_unparsed_page)
|
40
40
|
pagination_images_listings = pagination_parsed_page.css("div.p-item-thumb")
|
41
|
-
|
41
|
+
|
42
42
|
pagination_images_listings.each do |img|
|
43
43
|
if curr_index > max
|
44
44
|
return images
|
@@ -54,7 +54,7 @@ module TomosiaAmanaplusCrawl
|
|
54
54
|
images << current_image
|
55
55
|
curr_index += 1
|
56
56
|
end
|
57
|
-
|
57
|
+
|
58
58
|
curr_page += 1
|
59
59
|
end
|
60
60
|
images
|
@@ -75,7 +75,7 @@ module TomosiaAmanaplusCrawl
|
|
75
75
|
File.open("#{path}/#{curr_image[:url].split('/').last}", "a+") do |file|
|
76
76
|
file.write(image.read) # lưu hình ảnh
|
77
77
|
curr_image[:size] = image.size # cập nhật lại size trong mảng images
|
78
|
-
print
|
78
|
+
print '.'
|
79
79
|
end
|
80
80
|
end # end open
|
81
81
|
rescue => exception
|
@@ -99,10 +99,9 @@ module TomosiaAmanaplusCrawl
|
|
99
99
|
book = Spreadsheet::Workbook.new
|
100
100
|
sheet1 = book.create_worksheet
|
101
101
|
|
102
|
-
i = 0
|
103
102
|
sheet1.row(0).concat %w{Title Url Size(bytes) Extension}
|
104
103
|
puts "Writing..........."
|
105
|
-
images.
|
104
|
+
images.each_with_index do |img, i|
|
106
105
|
sheet1.row(i += 1).push img[:title], img[:url], img[:size], img[:extension]
|
107
106
|
end
|
108
107
|
puts "Writed."
|
data/lib/tomosia_amanaplus_crawl/cli.rb
CHANGED
@@ -8,7 +8,7 @@ module TomosiaAmanaplusCrawl
|
|
8
8
|
option :destination
|
9
9
|
option :max
|
10
10
|
def crawl(keyword)
|
11
|
-
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, options[:destination], options[:max].to_i)
|
11
|
+
TomosiaAmanaplusCrawl::Crawler.new.run(keyword, options[:destination], options[:max] == nil ? nil : options[:max].to_i)
|
12
12
|
end
|
13
13
|
end
|
14
14
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_amanaplus_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nhat Huy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|