pulse-downloader 0.1.21 → 0.1.26

Sign up to get free protection for your applications and access to all features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: db7034531cb0590f5975013cb817d4093d603d93a7d89d16d099203c063125f9
4
- data.tar.gz: e6277a95389810ebcd11957645318daf6829d05d44bea2e73bd2b05fcdda56d1
3
+ metadata.gz: b59437d8b4d75cbb7c68010644dbb046f6860e9c37187aefad578e4316111e82
4
+ data.tar.gz: fcc8cd9883129f60bd427c51c3420b7176a3347a0f8f87939669f7cf82d78966
5
5
  SHA512:
6
- metadata.gz: 72109c3a8509f1da9f1d2c25c1958d8e5cc897e03224dda26ce6ae5da50ac96c754ec502fa44a2bbb8a1ae99fe34b30841f6eb2c3dfa64181f32b3b6c9866e9c
7
- data.tar.gz: 780396541173f9fd8e1ce8044522bc876a3b55123adf169a8a62161252bf3e5905f09b38741c4faa70a064a8a89f01192005ff5a4ad924725b83a5b31c9ae9e1
6
+ metadata.gz: 81611fe51bce4a86cbeccb01362eaf8b1e8e3cc9a618b92206c46cc3afb875779e92962c18582b19df593941f68bac9726f7854902fb811ef97e2edabbd682f4
7
+ data.tar.gz: 916859633e6ac45c882e5c3ebce09834f4e1f0318fc3eba927549bea54be0f714255fb41f0c5b715f9a4a051a1219aeed7b89ddbc2b14600d0f98c3d79e6947b
data/README.md CHANGED
@@ -33,7 +33,8 @@ client = Pulse::Downloader::Client.new(
33
33
  drop_exitsing_files_in_path: false,
34
34
  save_and_dont_return: true,
35
35
  report_time: false,
36
- progress_bar: true
36
+ progress_bar: true,
37
+ scrape_images: true
37
38
  )
38
39
 
39
40
  client.call!
@@ -1,5 +1,5 @@
1
1
  module Pulse
2
2
  module Downloader
3
- VERSION = "0.1.21"
3
+ VERSION = "0.1.26"
4
4
  end
5
5
  end
@@ -1,7 +1,7 @@
1
1
  module Pulse
2
2
  module Downloader
3
3
  module WebPageParser
4
- def fetch_file_paths
4
+ def fetch_file_paths(custom_path_root=nil)
5
5
  @start_time = get_micro_second_time
6
6
 
7
7
  response = HTTParty.get(url, verify: verify_ssl)
@@ -12,29 +12,47 @@ module Pulse
12
12
  print_time
13
13
  end
14
14
 
15
- extract_file_urls(response)
15
+ if file_type.is_a?(Array)
16
+ file_type.flat_map do |type|
17
+ extract_file_urls(response, custom_path_root, type)
18
+ end
19
+ else
20
+ extract_file_urls(response, custom_path_root, file_type)
21
+ end
16
22
  end
17
23
 
18
24
  private
19
25
 
20
- def extract_file_urls(response)
26
+ def extract_file_urls(response, custom_path_root, type)
21
27
  return [] if response.body.nil? || response.body.empty?
22
- (
23
- extract_download_links(response) + extract_embedded_images(response)
28
+ remove_base64(
29
+ extract_all_urls(response, custom_path_root, type) +
30
+ extract_download_links(response, custom_path_root, type) +
31
+ extract_embedded_images(response, custom_path_root, type)
24
32
  ).uniq
25
33
  end
26
34
 
27
- def extract_download_links(response)
35
+ def extract_all_urls(response, custom_path_root, type)
36
+ parse_html(response.body)
37
+ .to_s
38
+ .split(/\s+/)
39
+ .find_all { |u| u =~ /^https?:/ }
40
+ .compact
41
+ .select { |link| (link.include? type || link.include?(custom_path_root)) }
42
+ .map { |link| add_base_url(link) }
43
+ end
44
+
45
+ def extract_download_links(response, custom_path_root, type)
28
46
  parse_html(response.body)
29
47
  .css('a')
30
48
  .to_a
31
49
  .map { |link| link['href'] }
32
50
  .compact
33
- .select { |link| link.include? file_type }
51
+ .select { |link| (link.include? type || link.include?(custom_path_root)) }
34
52
  .map { |link| add_base_url(link) }
35
53
  end
36
54
 
37
- def extract_embedded_images(response)
55
+ def extract_embedded_images(response, custom_path_root, type)
38
56
  return [] unless scrape_images
39
57
 
40
58
  parse_html(response.body)
@@ -42,16 +60,22 @@ module Pulse
42
60
  .to_a
43
61
  .map { |e| e["src"] }
44
62
  .compact
45
- .select { |link| link.include? file_type }
63
+ .select { |link| (link.include? type || link.include?(custom_path_root)) }
46
64
  .map { |link| add_base_url(link) }
47
65
  end
48
66
 
67
+ def remove_base64(urls)
68
+ urls.reject do |url|
69
+ url.include?(':image/') || url.include?('base64')
70
+ end
71
+ end
72
+
49
73
  def parse_html(raw_html)
50
74
  Nokogiri::HTML(raw_html)
51
75
  end
52
76
 
53
77
  def add_base_url(str)
54
- if !str.include?('https://') || !str.include?(base_url)
78
+ if !str.include?('https://') && !str.include?(base_url)
55
79
  "https://#{base_url}#{str}"
56
80
  else
57
81
  str
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pulse-downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.21
4
+ version: 0.1.26
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-10 00:00:00.000000000 Z
11
+ date: 2021-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: httparty