aws-whitepaper-downloader 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1945c635d5d084efc5fa70451a474a9d4486b4de
4
+ data.tar.gz: f8dcd349cc8c967fd2f375103a463b4cea15ba96
5
+ SHA512:
6
+ metadata.gz: 444f896f2c746ba7e44359c93bc79b1a043f66eac57c7052a6c590261c6cae0f9ce2ae6dcf56d4fc2e5cb2b21b6834e1ca6cc5aafdf0124c23c438f98e4fe15f
7
+ data.tar.gz: d069f3a3f9e522d2aa9db8d5e1de8bbcc5cff3f2bc1b569bedfa66162dd6818c631d87a3893e6cc7b3340984e909858a08430030afd9ae0f85a942a20af6494d
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/aws_whitepaper_downloader.rb'
4
+ require "aws_whitepaper_downloader/downloader"
5
+ require 'commander/import'
6
+ require "fileutils"
7
+
8
# Command-line definition for the `aws-whitepaper-downloader` executable,
# written with the commander DSL (`program` / `command`).
program :name, 'Aws White Paper Downloader'
# Report the gem's own version constant so `--version` cannot drift from the
# released gem (it used to be hard-coded to a different number).
program :version, AwsWhitepaperDownloader::VERSION
program :description, 'download all whitepaper sorted by category'

# `all`: crawl the whitepaper index and download everything below the
# directory the command is invoked from.
command :all do |c|
  c.syntax = 'aws-whitepaper-downloader all'
  c.description = 'Download every whitepaper into "./white pappers", grouped by category'
  # Neither positional arguments nor options are used by this command.
  c.action do |_args, _options|
    AwsWhitepaperDownloader.run(Dir.pwd)
  end
end
@@ -0,0 +1,11 @@
1
+ require "aws_whitepaper_downloader/version"
2
+ require "aws_whitepaper_downloader/crawler"
3
+ require "aws_whitepaper_downloader/downloader"
4
+ require 'pry'
5
+
6
# Namespace of the gem; exposes the single high-level entry point.
module AwsWhitepaperDownloader
  class << self
    # Crawls the whitepaper links and hands them to the downloader.
    #
    # @param dir_path [String, nil] directory passed through to
    #   Downloader#run; nil is forwarded unchanged.
    # @return whatever Downloader#run returns
    def run(dir_path = nil)
      whitepapers = Crawler.new.run
      fetcher = Downloader.new
      fetcher.run(whitepapers, dir_path)
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+
4
module AwsWhitepaperDownloader
  # Scrapes the whitepaper index page and returns the PDF links it finds,
  # grouped by the page's section titles.
  #
  # Result shape (all keys are strings):
  #   { "<title>" => { "type" => "normal", "<pdf name>" => "<url>", ... },
  #     "<title>" => { "type" => "nested",
  #                    "<subtitle>" => { "<pdf name>" => "<url>", ... } } }
  class Crawler
    # Page that lists the whitepapers.
    WHITEPAPERS_URL = 'https://aws.amazon.com/whitepapers/'.freeze

    # Top-level blocks of the page: section titles and the rows below them.
    SECTION_SELECTOR = '.content > .title-wrapper, .content > .row-builder'.freeze

    # Blocks inside a row that carries sub-headings (h3).
    NESTED_SELECTOR = '.wrapper > .columns > .parsys > .columnbuilder, ' \
                      '.wrapper > .columns > .parsys > .title-wrapper'.freeze

    # Fetches the index page and extracts the link tree.
    #
    # @return [Hash] see the class comment for the shape
    def run
      extract_links(fetch_index)
    end

    private

    # Downloads and parses the index page.
    # `URI#open` (from open-uri) is used instead of `Kernel#open`, which no
    # longer accepts URLs on Ruby 3.0+. The block form closes the stream.
    def fetch_index
      URI.parse(WHITEPAPERS_URL).open { |io| Nokogiri::HTML(io) }
    end

    # Walks the top-level blocks of +doc+ in document order and builds the
    # link tree.
    def extract_links(doc)
      output = {}
      current_hash = nil # hash of the section currently being filled
      pdfs = nil         # hash that receives the next PDF entries

      doc.css(SECTION_SELECTOR).each do |div|
        css_class = div['class'].to_s

        if css_class.include?('title-wrapper')
          title = parse_title(div)
          current_hash = (output[title] ||= {})
          # A new section starts: never keep writing into the previous one.
          pdfs = nil
        end

        next unless current_hash
        next unless css_class.include?('row-builder')

        if div.css('h3').empty?
          # Plain row: the PDFs are stored directly in the section hash.
          current_hash['type'] = 'normal'
          pdfs = current_hash
          parse_pdfs(div, pdfs)
        else
          # Row with sub-headings: each h3 opens its own bucket.
          current_hash['type'] = 'nested'
          div.css(NESTED_SELECTOR).each do |sub_div|
            subtitle = parse_subtitle(sub_div)
            pdfs = (current_hash[subtitle] ||= {}) if subtitle
            # Lists that appear before any heading have no bucket yet; skip
            # them instead of failing on nil.
            parse_pdfs(sub_div, pdfs) if pdfs
          end
        end
      end

      output
    end

    # @return [String] stripped text of the h1 inside +title_wrapper+
    def parse_title(title_wrapper)
      title_wrapper.css('h1').text.strip
    end

    # @return [String, nil] stripped text of the h3 inside +row_builder+,
    #   or nil when there is none
    def parse_subtitle(row_builder)
      h3_title = row_builder.css('h3').text.strip
      h3_title.empty? ? nil : h3_title
    end

    # Adds one "name => url" entry to +pdfs+ for every list item of +html+
    # that has both a bold name and a link.
    #
    # @return [Hash, nil] +pdfs+ itself
    def parse_pdfs(html, pdfs)
      return pdfs if pdfs.nil?

      html.css('li').each do |item|
        pdf_name = item.css('b').text.strip
        anchor = item.css('a').first
        # Items without a link used to raise NoMethodError on nil; skip them.
        pdf_url = anchor && absolute_url(anchor['href'])
        pdfs[pdf_name] = pdf_url if pdf_url && !pdf_name.empty?
      end
      pdfs
    end

    # Turns an href into an absolute URL.
    # Protocol-relative links ("//host/path") get the "https:" prefix exactly
    # as before; everything else is resolved against the index page, so
    # already-absolute links are kept as they are.
    #
    # @return [String, nil] nil when the href is blank or not a valid URI
    def absolute_url(href)
      href = href.to_s.strip
      return nil if href.empty?
      return "https:#{href}" if href.start_with?('//')

      URI.join(WHITEPAPERS_URL, href).to_s
    rescue URI::InvalidURIError
      nil
    end
  end
end
@@ -0,0 +1,68 @@
1
module AwsWhitepaperDownloader
  # Mirrors a link tree onto disk: one directory per category (and per
  # sub-category), one "<name>.pdf" file per link, fetched with `wget`.
  #
  # Expected tree: { "<category>" => { "type" => ..., "<name>" => "<url>" or
  # { "<name>" => "<url>" } } }. The "type" key is metadata and is skipped.
  class Downloader
    # Default download root, used when no target directory is given.
    # The "white pappers" spelling is kept so directories created by earlier
    # runs keep being reused.
    WHITE_PAPPER_ROOT_DIR = "#{__dir__}/white pappers"

    # Downloads every PDF referenced by +hash+.
    #
    # @param hash [Hash] link tree, see the class comment
    # @param download_path [String, nil] directory in which the
    #   "white pappers" folder is created; nil selects WHITE_PAPPER_ROOT_DIR
    # @return [Hash] +hash+ itself
    def run(hash, download_path)
      root = download_path ? File.join(download_path, 'white pappers') : WHITE_PAPPER_ROOT_DIR
      # Absolute paths are used everywhere, so the process working directory
      # never has to be changed.
      root = check_dir_created(nil, File.expand_path(root))

      hash.each_pair do |category, entries|
        download_entries(entries, check_dir_created(category, root))
      end
    end

    private

    # Downloads the entries of one (sub-)category into +dir+. String values
    # are URLs; hash values are sub-categories that get their own directory.
    # Only the entries passed in are visited, never the whole tree again.
    def download_entries(entries, dir)
      return unless entries.respond_to?(:each_pair)

      puts "Downloading into #{dir}"
      entries.each_pair do |name, value|
        next if name == 'type'

        if value.respond_to?(:each_pair)
          download_entries(value, check_dir_created(name, dir))
        else
          download(name, value, dir)
        end
      end
    end

    # Fetches +path+ (a URL) into "<download_dir>/<file_name>.pdf" unless that
    # file already exists.
    #
    # `wget` is started without a shell, so names and URLs taken from the
    # scraped page cannot inject commands. The data is written to a ".part"
    # file first and renamed on success, so an aborted transfer is not
    # mistaken for a finished download on the next run.
    def download(file_name, path, download_dir)
      url = path.to_s.strip
      return if url.empty?

      target = File.join(download_dir, "#{safe_name(file_name)}.pdf")
      return if File.exist?(target)

      partial = "#{target}.part"
      if system('wget', '-O', partial, '--', url)
        File.rename(partial, target)
      else
        File.delete(partial) if File.exist?(partial)
        warn "Failed to download #{url}"
      end
    end

    # Makes sure "<path>/<name>" exists (just +path+ when +name+ is nil).
    #
    # @return [String] the directory path
    def check_dir_created(name, path)
      location = name.nil? ? path : File.join(path, safe_name(name))
      Dir.mkdir(location) unless Dir.exist?(location)
      location
    end

    # Turns a scraped title into a single path component: path separators
    # and NUL bytes become "-", and blank or dot-only names become "untitled".
    def safe_name(name)
      cleaned = name.to_s.tr("/\0", '-').strip
      cleaned.empty? || %w[. ..].include?(cleaned) ? 'untitled' : cleaned
    end
  end
end
@@ -0,0 +1,3 @@
1
# Version information for the gem.
module AwsWhitepaperDownloader
  # Release number of this gem.
  VERSION = '0.1.0'
end
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: aws-whitepaper-downloader
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - StevenTTuD
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.14'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.14'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: commander
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Downloading all aws whitepapers by 1 command
70
+ email:
71
+ - StevenTTuD@gmail.com
72
+ executables:
73
+ - aws-whitepaper-downloader
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - bin/aws-whitepaper-downloader
78
+ - lib/aws_whitepaper_downloader.rb
79
+ - lib/aws_whitepaper_downloader/crawler.rb
80
+ - lib/aws_whitepaper_downloader/downloader.rb
81
+ - lib/aws_whitepaper_downloader/version.rb
82
+ homepage: https://github.com/StevenTTuD/aws-whitepaper-downloader
83
+ licenses: []
84
+ metadata:
85
+ allowed_push_host: https://rubygems.org
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ required_rubygems_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ requirements: []
101
+ rubyforge_project:
102
+ rubygems_version: 2.6.12
103
+ signing_key:
104
+ specification_version: 4
105
+ summary: Downloading all aws whitepapers by 1 command
106
+ test_files: []
107
+ has_rdoc: