pulse-downloader 0.1.31 → 0.1.35
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -0
- data/lib/pulse/downloader/client.rb +12 -5
- data/lib/pulse/downloader/version.rb +1 -1
- data/lib/pulse/downloader/web_page_parser.rb +42 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: df940062ab2c0dcdc96b256b76e8198683776d7bdb834198c0f75da85d5e63fb
|
4
|
+
data.tar.gz: 158f5746ceb0d820934e126cae7ca53dfca0a841ae8a588b2b955c85de5abc7f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2494b72940a92e4b13fc44c8c9c81ff63e48a35940e9bcc6437235afdd4c1ca2fdc43098b71f976ce42d5fa65b53979aa3b055b511c91c1ca6b0ff64a0dd472c
|
7
|
+
data.tar.gz: 1c40920e57fdbb4184034cc91953f82c5babf1c09146027b9b887b1bac5d8b45f4e386b21133bd44625f8adb8843daac8b4652e35e94ebf25188738d02d8a36c
|
data/README.md
CHANGED
@@ -29,6 +29,7 @@ client = Pulse::Downloader::Client.new(
|
|
29
29
|
save_data: true,
|
30
30
|
save_path: '',
|
31
31
|
read_from_save_path: false,
|
32
|
+
traverse_folders: false,
|
32
33
|
verify_ssl: true,
|
33
34
|
drop_exitsing_files_in_path: false,
|
34
35
|
save_and_dont_return: true,
|
@@ -38,6 +39,8 @@ client = Pulse::Downloader::Client.new(
|
|
38
39
|
)
|
39
40
|
|
40
41
|
client.call!
|
42
|
+
|
43
|
+
client.file_paths # Will give you the list of file paths
|
41
44
|
```
|
42
45
|
|
43
46
|
## Development
|
@@ -12,6 +12,7 @@ module Pulse
|
|
12
12
|
:save_data,
|
13
13
|
:save_path,
|
14
14
|
:read_from_save_path,
|
15
|
+
:traverse_folders,
|
15
16
|
:verify_ssl,
|
16
17
|
:headers,
|
17
18
|
:drop_exitsing_files_in_path,
|
@@ -20,7 +21,9 @@ module Pulse
|
|
20
21
|
:start_time,
|
21
22
|
:end_time,
|
22
23
|
:progress_bar,
|
23
|
-
:base_url
|
24
|
+
:base_url,
|
25
|
+
:file_paths,
|
26
|
+
:folder_urls
|
24
27
|
|
25
28
|
# Does not continue downloads-
|
26
29
|
# Will only save once the file has been downloaded in memory
|
@@ -28,7 +31,6 @@ module Pulse
|
|
28
31
|
# TODO: Validation
|
29
32
|
# TODO: Retry
|
30
33
|
# TODO: DNS
|
31
|
-
# TODO: Multiple filetypes
|
32
34
|
# TODO: lib/pulse/downloader/file_downloader.rb:13: warning: URI.escape is obsolete
|
33
35
|
def initialize(url:,
|
34
36
|
file_type:,
|
@@ -36,6 +38,7 @@ module Pulse
|
|
36
38
|
save_data: false,
|
37
39
|
save_path: '',
|
38
40
|
read_from_save_path: false,
|
41
|
+
traverse_folders: false,
|
39
42
|
verify_ssl: true,
|
40
43
|
headers: nil,
|
41
44
|
drop_exitsing_files_in_path: false,
|
@@ -49,6 +52,7 @@ module Pulse
|
|
49
52
|
@save_data = save_data
|
50
53
|
@save_path = save_path
|
51
54
|
@read_from_save_path = read_from_save_path
|
55
|
+
@traverse_folders = traverse_folders
|
52
56
|
@verify_ssl = verify_ssl
|
53
57
|
@headers = headers
|
54
58
|
@drop_exitsing_files_in_path = drop_exitsing_files_in_path
|
@@ -57,6 +61,7 @@ module Pulse
|
|
57
61
|
@progress_bar = progress_bar
|
58
62
|
|
59
63
|
@base_url = get_base_url
|
64
|
+
@folder_urls = []
|
60
65
|
end
|
61
66
|
|
62
67
|
def call!
|
@@ -66,12 +71,14 @@ module Pulse
|
|
66
71
|
# Resolves the list of file paths and, when +save_data+ is set, downloads
# each of them.
#
# The resolved paths are stored in +@file_paths+ before any download starts.
# When +@progress_bar+ is truthy it is replaced by a +::ProgressBar+ sized to
# the number of paths and incremented once per path.
#
# @return [false, Array] +false+ when +valid?+ fails, otherwise the result of
#   mapping over +file_paths+
def call
  return false unless valid?

  @file_paths = fetch_file_paths

  @progress_bar = ::ProgressBar.new(file_paths.size) if @progress_bar

  file_paths.map do |file_path|
    download(file_path, @progress_bar) if save_data
    # The bar is optional (see the guard above); calling +increment!+ on a
    # falsy value would raise NoMethodError and abort the whole run.
    @progress_bar.increment! if @progress_bar
  end
end
|
@@ -2,9 +2,38 @@ module Pulse
|
|
2
2
|
module Downloader
|
3
3
|
module WebPageParser
|
4
4
|
# Collects the parsed results for +url+, or, when +traverse_folders+ is set,
# for every link returned by +fetch_folders(url)+.
#
# @param custom_path_root [String, nil] forwarded unchanged to
#   +fetch_and_parse_response+
# @return [Array] the combined results of +fetch_and_parse_response+
def fetch_file_paths(custom_path_root = nil)
  return fetch_and_parse_response(url, custom_path_root) unless traverse_folders

  # +each+ would return the folder list itself and discard every parsed page,
  # so the per-folder results are gathered with +flat_map+ instead.
  # +Array()+ keeps this safe if +fetch_folders+ returns nil.
  Array(fetch_folders(url)).flat_map do |folder_url|
    fetch_and_parse_response(folder_url, custom_path_root)
  end
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
# Recursively collects the links reachable from +base_url+ into
# +@folder_urls+.
#
# Each page is fetched with +get_response+ and its links are taken from
# +extract_hrefs+. Only links that have not been recorded yet are followed,
# so pages that link back to each other cannot cause endless recursion.
#
# NOTE(review): every href is followed, not only directory links — confirm
# whether non-folder links should be filtered out before recursing.
#
# @param base_url [String] page whose links are collected
# @param custom_path_root [String, nil] forwarded to +extract_hrefs+
# @return [Array] every link discovered so far (empty when none were found)
def fetch_folders(base_url, custom_path_root = nil)
  @folder_urls ||= []

  # Drop nil entries up front so they are neither stored nor recursed into.
  current_paths = extract_hrefs(get_response(base_url), custom_path_root).compact.uniq
  new_paths = current_paths - @folder_urls
  return @folder_urls if new_paths.empty?

  @folder_urls |= new_paths
  new_paths.each { |path| fetch_folders(path, custom_path_root) }

  @folder_urls
end
|
28
|
+
|
29
|
+
# Fetches +folder_url+ through +get_response+ and passes the response to
# +parse_response+ along with +custom_path_root+ and +file_type+.
#
# @param folder_url [String] address handed to +get_response+
# @param custom_path_root [String, nil] forwarded to +parse_response+
# @return whatever +parse_response+ returns
def fetch_and_parse_response(folder_url, custom_path_root)
  page = get_response(folder_url)
  parse_response(page, custom_path_root, file_type)
end
|
32
|
+
|
33
|
+
def get_response(folder_url)
|
5
34
|
@start_time = get_micro_second_time
|
6
35
|
|
7
|
-
response = HTTParty.get(
|
36
|
+
response = HTTParty.get(folder_url, verify: verify_ssl, headers: headers)
|
8
37
|
|
9
38
|
@end_time = get_micro_second_time
|
10
39
|
|
@@ -12,6 +41,10 @@ module Pulse
|
|
12
41
|
print_time
|
13
42
|
end
|
14
43
|
|
44
|
+
response
|
45
|
+
end
|
46
|
+
|
47
|
+
def parse_response(response, custom_path_root, file_type)
|
15
48
|
if file_type.is_a?(Array)
|
16
49
|
file_type.flat_map do |type|
|
17
50
|
extract_file_urls(response, custom_path_root, type)
|
@@ -21,8 +54,6 @@ module Pulse
|
|
21
54
|
end
|
22
55
|
end
|
23
56
|
|
24
|
-
private
|
25
|
-
|
26
57
|
def extract_file_urls(response, custom_path_root, type)
|
27
58
|
return [] if response.body.nil? || response.body.empty?
|
28
59
|
|
@@ -33,6 +64,14 @@ module Pulse
|
|
33
64
|
).uniq
|
34
65
|
end
|
35
66
|
|
67
|
+
# Extracts the +href+ of every +a+ element in the response body and passes
# each one through +add_base_url+.
#
# The "../" link is skipped, and so are anchors that carry no +href+ at all,
# which would otherwise reach +add_base_url+ as nil.
#
# @param response [#body] response whose body is handed to +parse_html+
# @param custom_path_root [String, nil] forwarded to +add_base_url+
# @return [Array] the results of +add_base_url+, one per kept link
def extract_hrefs(response, custom_path_root)
  parse_html(response.body)
    .css('a')
    .map { |link| link['href'] }
    .compact
    .reject { |href| href == "../" }
    .map { |href| add_base_url(href, custom_path_root) }
end
|
74
|
+
|
36
75
|
def extract_all_urls(response, custom_path_root, type)
|
37
76
|
parse_html(response.body)
|
38
77
|
.to_s
|