aws-whitepaper-downloader 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1945c635d5d084efc5fa70451a474a9d4486b4de
4
+ data.tar.gz: f8dcd349cc8c967fd2f375103a463b4cea15ba96
5
+ SHA512:
6
+ metadata.gz: 444f896f2c746ba7e44359c93bc79b1a043f66eac57c7052a6c590261c6cae0f9ce2ae6dcf56d4fc2e5cb2b21b6834e1ca6cc5aafdf0124c23c438f98e4fe15f
7
+ data.tar.gz: d069f3a3f9e522d2aa9db8d5e1de8bbcc5cff3f2bc1b569bedfa66162dd6818c631d87a3893e6cc7b3340984e909858a08430030afd9ae0f85a942a20af6494d
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/aws_whitepaper_downloader.rb'
4
+ require "aws_whitepaper_downloader/downloader"
5
+ require 'commander/import'
6
+ require "fileutils"
7
+
8
# CLI metadata shown by `--help` and `--version` (Commander DSL).
program :name, 'Aws White Paper Downloader'
# Report the gem's real version rather than a hard-coded literal, so that
# `--version` cannot drift from AwsWhitepaperDownloader::VERSION (which is
# loaded through the lib file required at the top of this script).
program :version, AwsWhitepaperDownloader::VERSION
program :description, 'download all whitepaper sorted by category'

# `aws-whitepaper-downloader all`
# Crawls the whitepaper index and downloads everything below the directory
# the command was started from.
command :all do |c|
  c.syntax = 'aws-whitepaper-downloader all'
  c.description = 'Download every AWS whitepaper into "white pappers" under the current directory'
  # Neither positional arguments nor options are used by this command.
  c.action do |_args, _options|
    AwsWhitepaperDownloader.run( Dir.pwd )
  end
end
@@ -0,0 +1,11 @@
1
+ require "aws_whitepaper_downloader/version"
2
+ require "aws_whitepaper_downloader/crawler"
3
+ require "aws_whitepaper_downloader/downloader"
4
+ require 'pry'
5
+
6
# Top-level namespace of the gem; exposes the single entry point used by the CLI.
module AwsWhitepaperDownloader
  class << self
    # Crawls the whitepaper index and hands the result to the downloader.
    #
    # @param dir_path [String, nil] directory passed through unchanged to
    #   Downloader#run (nil lets the downloader choose its default)
    # @return whatever Downloader#run returns
    def run( dir_path=nil )
      crawled_links = Crawler.new.run
      downloader = Downloader.new
      downloader.run( crawled_links, dir_path )
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+
4
module AwsWhitepaperDownloader
  # Scrapes the AWS whitepaper index page and turns it into a nested Hash:
  #
  #   { category => { 'type' => 'normal', pdf_name => pdf_url, ... },
  #     category => { 'type' => 'nested', subtitle => { pdf_name => pdf_url } } }
  class Crawler

    # Index page that is crawled; also the base for resolving relative links.
    WHITEPAPERS_URL = "https://aws.amazon.com/whitepapers/"

    # Top-level blocks of the page: category headings and their content rows.
    SECTION_SELECTOR = ".content > .title-wrapper, .content > .row-builder"

    # Children of a nested content row: sub-headings and the link lists.
    NESTED_SELECTOR = [
      ".wrapper > .columns > .parsys > .columnbuilder",
      ".wrapper > .columns > .parsys > .title-wrapper"
    ].join(', ')

    def initialize
    end

    # Fetches and parses the index page.
    #
    # @return [Hash] categories keyed by their heading text (see class comment)
    def run
      output = {}
      current_hash = nil
      bucket = nil

      # URI#open (from open-uri) works on old and new Rubies, whereas
      # Kernel#open no longer fetches URLs on Ruby 3+. The block form closes
      # the underlying IO once the document has been parsed.
      doc = URI.parse( WHITEPAPERS_URL ).open { |io| Nokogiri::HTML( io ) }

      doc.css( SECTION_SELECTOR ).each do |div|
        css_class = div['class'].to_s

        # div == title-wrapper: start a new category
        if css_class.include?("title-wrapper")
          title = parse_title( div )
          output[title] ||= {}
          current_hash = output[title]
          # A subtitle bucket must never carry over into the next category.
          bucket = nil
        end

        # div == row-builder: content belonging to the current category
        next unless current_hash
        next unless css_class.include?("row-builder")

        if div.css('h3').size > 0
          # nested: PDFs are grouped below <h3> subtitles
          current_hash['type'] = 'nested'
          bucket = parse_nested( div, current_hash, bucket )
        else
          # normal: PDFs hang directly off the category
          current_hash['type'] = 'normal'
          parse_pdfs( div, current_hash )
        end
      end

      output
    end

    private

    # @return [String] stripped text of the <h1> inside a title wrapper
    def parse_title( title_wrapper )
      return title_wrapper.css('h1').text.strip
    end

    # @return [String, nil] stripped <h3> text, or nil when there is none
    def parse_subtitle( row_builder )
      h3_title = row_builder.css('h3').text.strip
      return h3_title.size > 0 ? h3_title : nil
    end

    # Walks the children of a nested row, filing PDFs under the most recent
    # subtitle.
    #
    # @param row_builder the nested content row
    # @param section [Hash] the category hash that receives subtitle buckets
    # @param bucket [Hash, nil] subtitle bucket still open from an earlier row
    #   of the same category
    # @return [Hash, nil] the bucket that is open after this row
    def parse_nested( row_builder, section, bucket )
      row_builder.css( NESTED_SELECTOR ).each do |sub_div|
        if ( subtitle = parse_subtitle( sub_div ) )
          section[subtitle] ||= {}
          bucket = section[subtitle]
        end
        # Links that appear before any subtitle have nowhere to go; skip them
        # instead of crashing on a nil bucket.
        next unless bucket
        parse_pdfs( sub_div, bucket )
      end
      bucket
    end

    # Collects name => url pairs from every <li> below +html+ into +pdfs+.
    # Items without a bold name or without a usable link are skipped.
    #
    # @param html node that answers +css+
    # @param pdfs [Hash] destination hash, mutated in place
    # @return [Hash] the same +pdfs+ hash
    def parse_pdfs( html, pdfs )
      html.css('li').each do |item|
        pdf_name = item.css('b').text
        link = item.css('a').first
        href = link && link['href']
        next if pdf_name.empty? || href.nil? || href.empty?

        pdf_url = absolute_url( href )
        pdfs[pdf_name] = pdf_url if pdf_url
      end
      return pdfs
    end

    # Turns an href from the page into an absolute URL.
    #
    # @return [String, nil] nil when the href cannot be parsed as a URI
    def absolute_url( href )
      # Protocol-relative links ("//host/file.pdf") only need a scheme.
      return "https:#{ href }" if href.start_with?('//')
      # Already absolute: do not prepend a second scheme.
      return href if href =~ /\Ahttps?:/i

      URI.join( WHITEPAPERS_URL, href ).to_s
    rescue URI::Error
      nil
    end

  end

end
@@ -0,0 +1,68 @@
1
require 'fileutils'

module AwsWhitepaperDownloader
  # Downloads every PDF described by the crawler's Hash into a directory tree
  # that mirrors the categories (and subtitles, for nested categories).
  class Downloader

    # Default download root, used when no download path is given.
    WHITE_PAPPER_ROOT_DIR = "#{__dir__}/white pappers"

    def initialize
    end

    # Creates the directory tree and downloads every PDF.
    #
    # @param hash [Hash] category => { 'type' => 'normal' | 'nested', ... }
    #   'normal' categories map pdf_name => url directly; 'nested' categories
    #   map subtitle => { pdf_name => url }
    # @param download_path [String, nil] parent directory for the
    #   "white pappers" folder; nil falls back to WHITE_PAPPER_ROOT_DIR
    # @return [Hash] the +hash+ argument
    def run( hash, download_path )
      download_path = download_path ? "#{ download_path }/white pappers" : WHITE_PAPPER_ROOT_DIR
      check_dir_created( nil, download_path )

      hash.each_pair do |key, value|
        current_dir = check_dir_created( key, download_path )
        puts "---------------- current_dir -------------------"
        puts current_dir
        puts "-------------------------------------------------"
        case value['type']
        when 'normal'
          Dir.chdir current_dir
          puts "---------------- Dir.chdir curren_dir -------------------"
          puts current_dir
          puts "-------------------------------------------------"
          puts 'normal'
          # Only this category's own PDFs belong in its directory; iterating
          # the whole input here would re-download every other category.
          download_all( value, current_dir )
        when 'nested'
          puts 'nested'
          value.each_pair do |k1, v1|
            next if k1 == 'type'
            nested_dir = check_dir_created( k1, current_dir )
            Dir.chdir( nested_dir )
            puts "---------------- nested_dir -------------------"
            puts nested_dir
            puts "------------ㄦ-----------------------------"

            download_all( v1, nested_dir )
          end
        end

      end
    end

    private

    # Downloads every name => url pair of +pdfs+ into +dir+, skipping the
    # bookkeeping 'type' entry.
    def download_all( pdfs, dir )
      pdfs.each_pair do |name, url|
        next if name == 'type'
        download( name, url, dir )
      end
    end

    # Fetches one PDF with wget unless it is already present.
    #
    # @param file_name [String] display name of the paper; becomes "<name>.pdf"
    # @param path [String] URL of the PDF
    # @param download_dir [String] directory the file is written to
    # @return [Boolean, nil] result of Kernel#system, or nil when skipped
    def download( file_name, path, download_dir )
      target = File.join( download_dir, "#{ safe_name( file_name ) }.pdf" )
      return if File.exist?( target )

      # Argument-list form: no shell is involved, so names and URLs scraped
      # from the web page cannot inject shell syntax. -O names the output file.
      fetched = system( 'wget', '-O', target, path )
      # wget -O leaves an empty/partial file behind on failure; remove it so
      # the next run retries instead of treating the paper as downloaded.
      FileUtils.rm_f( target ) unless fetched
      fetched
    end

    # Ensures the directory "<path>/<name>" exists and returns its location.
    #
    # @param name [String, nil] directory name (nil means +path+ itself)
    # @param path [String] parent directory
    # @return [String] the directory location
    def check_dir_created( name, path )
      location = "#{path}/#{ safe_name( name ) }"
      FileUtils.mkdir_p( location )
      return location
    end

    # Replaces path separators so that a heading such as "A/B" stays a single
    # file-system entry instead of pointing into a missing sub-directory.
    def safe_name( name )
      name.to_s.tr( '/', '-' )
    end

  end
end
@@ -0,0 +1,3 @@
1
# Gem namespace; this file only carries the release number.
module AwsWhitepaperDownloader
  # Version string of the gem.
  VERSION = '0.1.0'
end
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: aws-whitepaper-downloader
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - StevenTTuD
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.14'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.14'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: commander
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Downloading all aws whitepapers by 1 command
70
+ email:
71
+ - StevenTTuD@gmail.com
72
+ executables:
73
+ - aws-whitepaper-downloader
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - bin/aws-whitepaper-downloader
78
+ - lib/aws_whitepaper_downloader.rb
79
+ - lib/aws_whitepaper_downloader/crawler.rb
80
+ - lib/aws_whitepaper_downloader/downloader.rb
81
+ - lib/aws_whitepaper_downloader/version.rb
82
+ homepage: https://github.com/StevenTTuD/aws-whitepaper-downloader
83
+ licenses: []
84
+ metadata:
85
+ allowed_push_host: https://rubygems.org
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ required_rubygems_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ requirements: []
101
+ rubyforge_project:
102
+ rubygems_version: 2.6.12
103
+ signing_key:
104
+ specification_version: 4
105
+ summary: Downloading all aws whitepapers by 1 command
106
+ test_files: []
107
+ has_rdoc: