tomosia_amanaplus_crawl 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +6 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +21 -0
- data/README.md +45 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/tomosia_amanaplus_crawl.rb +92 -0
- data/lib/tomosia_amanaplus_crawl/version.rb +3 -0
- data/tomosia_amanaplus_crawl.gemspec +17 -0
- metadata +19 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1daa69b29114aa1cd35172183b838b1dec522be2cc796449f6cfcb93c279a7de
|
4
|
+
data.tar.gz: 1f656912c20d6e924ac40a0bc87f60b5e3c01762ccc2d2f4c72c202e23acfad0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86f3eeed3acbe50e039ac50b1359f9b8894354d966dcf84321465c1584789bb77444acc40d608380e53b7873c265699f5e5501f44883b0178ab95baf61b87ee6
|
7
|
+
data.tar.gz: 5556e3ea60053e307dc1ef5f08412220046cf07fe63ad018d99d4f82a9e756bc66a9539e3c0c190ac0de8f6a3e6cdeb94b783bdb3f2c4730300bac5e70fb9973
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
2
|
+
|
3
|
+
## Our Pledge
|
4
|
+
|
5
|
+
In the interest of fostering an open and welcoming environment, we as
|
6
|
+
contributors and maintainers pledge to making participation in our project and
|
7
|
+
our community a harassment-free experience for everyone, regardless of age, body
|
8
|
+
size, disability, ethnicity, gender identity and expression, level of experience,
|
9
|
+
nationality, personal appearance, race, religion, or sexual identity and
|
10
|
+
orientation.
|
11
|
+
|
12
|
+
## Our Standards
|
13
|
+
|
14
|
+
Examples of behavior that contributes to creating a positive environment
|
15
|
+
include:
|
16
|
+
|
17
|
+
* Using welcoming and inclusive language
|
18
|
+
* Being respectful of differing viewpoints and experiences
|
19
|
+
* Gracefully accepting constructive criticism
|
20
|
+
* Focusing on what is best for the community
|
21
|
+
* Showing empathy towards other community members
|
22
|
+
|
23
|
+
Examples of unacceptable behavior by participants include:
|
24
|
+
|
25
|
+
* The use of sexualized language or imagery and unwelcome sexual attention or
|
26
|
+
advances
|
27
|
+
* Trolling, insulting/derogatory comments, and personal or political attacks
|
28
|
+
* Public or private harassment
|
29
|
+
* Publishing others' private information, such as a physical or electronic
|
30
|
+
address, without explicit permission
|
31
|
+
* Other conduct which could reasonably be considered inappropriate in a
|
32
|
+
professional setting
|
33
|
+
|
34
|
+
## Our Responsibilities
|
35
|
+
|
36
|
+
Project maintainers are responsible for clarifying the standards of acceptable
|
37
|
+
behavior and are expected to take appropriate and fair corrective action in
|
38
|
+
response to any instances of unacceptable behavior.
|
39
|
+
|
40
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
41
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
42
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
43
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
44
|
+
threatening, offensive, or harmful.
|
45
|
+
|
46
|
+
## Scope
|
47
|
+
|
48
|
+
This Code of Conduct applies both within project spaces and in public spaces
|
49
|
+
when an individual is representing the project or its community. Examples of
|
50
|
+
representing a project or community include using an official project e-mail
|
51
|
+
address, posting via an official social media account, or acting as an appointed
|
52
|
+
representative at an online or offline event. Representation of a project may be
|
53
|
+
further defined and clarified by project maintainers.
|
54
|
+
|
55
|
+
## Enforcement
|
56
|
+
|
57
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
58
|
+
reported by contacting the project team at tt.huy.dang@tomosia.com. All
|
59
|
+
complaints will be reviewed and investigated and will result in a response that
|
60
|
+
is deemed necessary and appropriate to the circumstances. The project team is
|
61
|
+
obligated to maintain confidentiality with regard to the reporter of an incident.
|
62
|
+
Further details of specific enforcement policies may be posted separately.
|
63
|
+
|
64
|
+
Project maintainers who do not follow or enforce the Code of Conduct in good
|
65
|
+
faith may face temporary or permanent repercussions as determined by other
|
66
|
+
members of the project's leadership.
|
67
|
+
|
68
|
+
## Attribution
|
69
|
+
|
70
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
71
|
+
available at [https://contributor-covenant.org/version/1/4][version]
|
72
|
+
|
73
|
+
[homepage]: https://contributor-covenant.org
|
74
|
+
[version]: https://contributor-covenant.org/version/1/4/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2020 tthuydang
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# TomosiaAmanaplusCrawl
|
2
|
+
|
3
|
+
TomosiaAmanaplusCrawl is a small crawler for the image search results of https://plus.amanaimages.com. To experiment with the code, run `bin/console` for an interactive prompt.
|
4
|
+
|
5
|
+
Given a keyword and a destination directory, it collects the images listed on every result page, downloads them into `<destination>/Downloads`, and writes an Excel sheet (title, URL, size in bytes, extension) into `<destination>/File Excel`.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'tomosia_amanaplus_crawl'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle install
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install tomosia_amanaplus_crawl
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
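After `require 'tomosia_amanaplus_crawl'`, call `TomosiaAmanaplusCrawl::Crawler.new.run(keyword, destination)`, where `keyword` is the search term and `destination` is the directory that will receive the downloaded images and the Excel file.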
|
26
|
+
|
27
|
+
## Development
|
28
|
+
|
29
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
+
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/tthuydang/tomosia_amanaplus_crawl. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/tthuydang/tomosia_amanaplus_crawl/blob/master/CODE_OF_CONDUCT.md).
|
36
|
+
|
37
|
+
|
38
|
+
## License
|
39
|
+
|
40
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
41
|
+
|
42
|
+
## Code of Conduct
|
43
|
+
|
44
|
+
Everyone interacting in the TomosiaAmanaplusCrawl project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/tthuydang/tomosia_amanaplus_crawl/blob/master/CODE_OF_CONDUCT.md).
|
45
|
+
# tomosia_amanaplus_crawl
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: sets up Bundler's load path, loads the gem and then
# starts an interactive IRB session so the library can be tried out by hand.
require "bundler/setup"
require "tomosia_amanaplus_crawl"
require "irb"

# Fixtures or other initialization code can be added here to make
# experimenting with the gem easier.
#
# A different console can be used as well. For Pry, add pry to the Gemfile
# and swap the IRB lines for:
#   require "pry"
#   Pry.start

IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
require "tomosia_amanaplus_crawl/version"
|
2
|
+
|
3
|
+
module TomosiaAmanaplusCrawl
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'httparty'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'fileutils'
|
8
|
+
require 'spreadsheet'
|
9
|
+
|
10
|
+
class Crawler
  # Base URL of the search endpoint. It already ends with a slash, so the
  # keyword is appended directly without adding another separator.
  URL = "https://plus.amanaimages.com/items/search/"

  # Crawls all result pages for +keyword+, downloads the images that were
  # found and writes a spreadsheet describing them.
  #
  # @param keyword [String] search keyword appended to the search URL
  # @param destination [String] directory under which the "Downloads" and
  #   "File Excel" folders are created
  def run(keyword, destination)
    unparsed_page = HTTParty.get("#{URL}#{keyword}")
    parsed_page = Nokogiri::HTML(unparsed_page)

    # Total number of pages, read from the last pagination link. When there
    # is no pagination link at all, only the page just fetched is crawled.
    last_page_link = parsed_page.css("div.c-paginate__nums").css('a').last
    pages = last_page_link ? [last_page_link.text.to_i, 1].max : 1
    images_listings = parsed_page.css("div.p-search-result__body") # container elements of the result list

    images = getPaginationImages(images_listings, pages, keyword)
    downloadImages(images, destination)
    writeToExcel(images, destination)
  end

  # Collects the image entries of every result page into one array.
  #
  # @param images_listings [Object] not used; kept so existing callers
  #   continue to work
  # @param pages [Integer] number of result pages to fetch (1..pages)
  # @param keyword [String] search keyword
  # @return [Array<Hash>] one hash per image with the keys :title, :url,
  #   :size (the placeholder 'unknow' until downloaded) and :extension
  def getPaginationImages(images_listings, pages, keyword)
    images = []
    (1..pages).each do |curr_page|
      puts "Crawling page #{curr_page}..........."

      response = HTTParty.get("#{URL}#{keyword}?page=#{curr_page}")
      thumbnails = Nokogiri::HTML(response).css("div.p-item-thumb")

      thumbnails.each do |thumb|
        img_tag = thumb.css('img')
        # Prefer data-src and fall back to the plain src attribute.
        src = (img_tag.attr('data-src') || img_tag.attr('src')).to_s
        next if src.empty? # nothing to download for this thumbnail

        title_link = thumb.css('a')[1]
        images << {
          title: title_link && title_link.attr('title'),
          url: src,
          size: 'unknow',
          extension: ".#{src.split('.').last}"
        }
      end
    end
    images
  end

  # Downloads every image into "<destination>/Downloads" (one thread per
  # image) and stores the downloaded byte count in each entry's :size.
  #
  # @param images [Array<Hash>] entries as built by #getPaginationImages;
  #   each entry's :size is updated in place
  # @param destination [String] parent directory; created if missing
  def downloadImages(images, destination)
    path = File.join(destination, "Downloads")
    FileUtils.mkdir_p(path)

    print "\nDownloading"
    threads = images.map do |curr_image|
      Thread.new(curr_image) do |image_info|
        # URI#open only ever performs a URL fetch. Kernel#open must not be
        # used here: the URL comes from scraped HTML, and Kernel#open would
        # treat strings such as "| cmd" as a command to execute.
        URI.parse(image_info[:url]).open do |remote|
          data = remote.read
          file_name = image_info[:url].split('/').last
          # "wb": binary-safe, and truncates instead of appending to a file
          # left over from a previous run.
          File.open(File.join(path, file_name), "wb") { |file| file.write(data) }
          image_info[:size] = data.bytesize
          print "."
        end
      end
    end
    threads.each(&:join)
    puts "\nDownloaded."
  end

  # Writes one spreadsheet row per image to
  # "<destination>/File Excel/YeuNgucLep.xls".
  #
  # @param images [Array<Hash>] entries with :title, :url, :size, :extension
  # @param destination [String] parent directory; created if missing
  def writeToExcel(images, destination)
    path = File.join(destination, "File Excel")
    FileUtils.mkdir_p(path)

    book = Spreadsheet::Workbook.new
    sheet1 = book.create_worksheet

    sheet1.row(0).concat %w{Title Url Size(bytes) Extension}
    puts "Writing..........."
    images.each_with_index do |img, index|
      # Row 0 holds the header, so data rows start at 1.
      sheet1.row(index + 1).push img[:title], img[:url], img[:size], img[:extension]
    end
    puts "Writed."

    book.write File.join(path, "YeuNgucLep.xls")
  end
end
|
92
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require_relative 'lib/tomosia_amanaplus_crawl/version'
|
2
|
+
|
3
|
+
# Gem specification for tomosia_amanaplus_crawl.
Gem::Specification.new do |spec|
  spec.name          = "tomosia_amanaplus_crawl"
  spec.version       = TomosiaAmanaplusCrawl::VERSION
  spec.authors       = ["Nhat Huy"]

  spec.summary       = %q{tomosia_amanaplus_crawl demo project crawl du lieu.}
  spec.homepage      = "https://github.com/tthuydang/tomosia_amanaplus_crawl"
  # The gem ships LICENSE.txt with the MIT license; declare it so it shows up
  # in the published metadata.
  spec.license       = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")

  # Package every file tracked by git except test/spec/features directories.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies: lib/tomosia_amanaplus_crawl.rb requires these gems,
  # so they must be installed together with this gem.
  # NOTE(review): no version constraints are set here — choose suitable ones.
  spec.add_dependency "nokogiri"
  spec.add_dependency "httparty"
  spec.add_dependency "spreadsheet"
end
|
metadata
CHANGED
@@ -1,24 +1,36 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tomosia_amanaplus_crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nhat Huy
|
8
8
|
autorequire:
|
9
|
-
bindir:
|
9
|
+
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
date: 2020-08-06 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
14
|
-
email:
|
13
|
+
description:
|
14
|
+
email:
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
18
|
-
files:
|
18
|
+
files:
|
19
|
+
- ".gitignore"
|
20
|
+
- ".rspec"
|
21
|
+
- ".travis.yml"
|
22
|
+
- CODE_OF_CONDUCT.md
|
23
|
+
- Gemfile
|
24
|
+
- LICENSE.txt
|
25
|
+
- README.md
|
26
|
+
- Rakefile
|
27
|
+
- bin/console
|
28
|
+
- bin/setup
|
29
|
+
- lib/tomosia_amanaplus_crawl.rb
|
30
|
+
- lib/tomosia_amanaplus_crawl/version.rb
|
31
|
+
- tomosia_amanaplus_crawl.gemspec
|
19
32
|
homepage: https://github.com/tthuydang/tomosia_amanaplus_crawl
|
20
|
-
licenses:
|
21
|
-
- MIT
|
33
|
+
licenses: []
|
22
34
|
metadata: {}
|
23
35
|
post_install_message:
|
24
36
|
rdoc_options: []
|