pulse-downloader 0.1.24 → 0.1.29

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 13ed97a9caedc58bca6c4247f2d56d7456aecafe0f2eecb65b6c25b5de404fd4
4
- data.tar.gz: 0d450684690c49812a63a0aec81f9e9d77e6074898a6ac2be9b7143fe5989adb
3
+ metadata.gz: ccce085a74c0c4f7710a6e3c7431b191e77bf313a4219ec08e8cd34e78e3340b
4
+ data.tar.gz: ad2307a1ed1d368fd53a9fa965d7e864a90a263dac689656c49e8a5f62484d9f
5
5
  SHA512:
6
- metadata.gz: '001790c43ba48c68ad1222d70f1352e5b399384fb7be69c6951ac80fd629ed2009fcc4a1292e23ab712ddaa1a9f206a59b1abd7f965625486160224e4e7a77bb'
7
- data.tar.gz: d49f98068eec42477b7069948f4a7c72b37b4e166c6f843b4e7e81763c27eb5e5561115f7b15800e48ed2cd86f5397e6a21f7d3eaf2816a91b017795399fbb9f
6
+ metadata.gz: 654608db5cd95af6aafde8741864bda84dda783348d0d85baf675418b51c65def47c2ce671910f4313a91c0899784ae92e0d4c6da8e43926fafeaf2053ee2c02
7
+ data.tar.gz: 520111b61ec241a70321f40707659c09317df3d2b950702be65758bb0a29e6c3bf7880fd7c61e57dafb4450ea2c05b51f834dcb3a2c9ad98ca06f3d61d5c327d
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- pulse-downloader (0.1.11)
4
+ pulse-downloader (0.1.26)
5
5
  active_attr (~> 0.15)
6
6
  httparty (~> 0.18)
7
7
  nokogiri (~> 1.11)
@@ -69,7 +69,7 @@ GEM
69
69
  ruby-progressbar
70
70
  mocha (1.11.2)
71
71
  multi_xml (0.6.0)
72
- nokogiri (1.11.3-x86_64-linux)
72
+ nokogiri (1.11.5-x86_64-linux)
73
73
  racc (~> 1.4)
74
74
  options (2.3.2)
75
75
  progress_bar (1.3.3)
data/README.md CHANGED
@@ -33,7 +33,8 @@ client = Pulse::Downloader::Client.new(
33
33
  drop_exitsing_files_in_path: false,
34
34
  save_and_dont_return: true,
35
35
  report_time: false,
36
- progress_bar: true
36
+ progress_bar: true,
37
+ scrape_images: true
37
38
  )
38
39
 
39
40
  client.call!
@@ -13,6 +13,7 @@ module Pulse
13
13
  :save_path,
14
14
  :read_from_save_path,
15
15
  :verify_ssl,
16
+ :headers,
16
17
  :drop_exitsing_files_in_path,
17
18
  :save_and_dont_return,
18
19
  :report_time,
@@ -36,6 +37,7 @@ module Pulse
36
37
  save_path: '',
37
38
  read_from_save_path: false,
38
39
  verify_ssl: true,
40
+ headers: nil,
39
41
  drop_exitsing_files_in_path: false,
40
42
  save_and_dont_return: true,
41
43
  report_time: false,
@@ -48,6 +50,7 @@ module Pulse
48
50
  @save_path = save_path
49
51
  @read_from_save_path = read_from_save_path
50
52
  @verify_ssl = verify_ssl
53
+ @headers = headers
51
54
  @drop_exitsing_files_in_path = drop_exitsing_files_in_path
52
55
  @save_and_dont_return = save_and_dont_return
53
56
  @report_time = report_time
@@ -8,7 +8,10 @@ module Pulse
8
8
 
9
9
  @start_time = get_micro_second_time
10
10
 
11
- file_data = HTTParty.get(escape(compute_file_link(file_path)), verify: verify_ssl)
11
+ file_data = HTTParty.get(
12
+ escape(compute_file_link(file_path)),
13
+ verify: verify_ssl
14
+ )
12
15
 
13
16
  @end_time = get_micro_second_time
14
17
 
@@ -1,5 +1,5 @@
1
1
  module Pulse
2
2
  module Downloader
3
- VERSION = "0.1.24"
3
+ VERSION = "0.1.29"
4
4
  end
5
5
  end
@@ -4,7 +4,7 @@ module Pulse
4
4
  def fetch_file_paths(custom_path_root=nil)
5
5
  @start_time = get_micro_second_time
6
6
 
7
- response = HTTParty.get(url, verify: verify_ssl)
7
+ response = HTTParty.get(url, verify: verify_ssl, headers: headers)
8
8
 
9
9
  @end_time = get_micro_second_time
10
10
 
@@ -25,23 +25,35 @@ module Pulse
25
25
 
26
26
  def extract_file_urls(response, custom_path_root, type)
27
27
  return [] if response.body.nil? || response.body.empty?
28
- (
29
- extract_download_links(response, custom_path_root, type) +
30
- extract_embedded_images(response, custom_path_root, type)
28
+
29
+ remove_base64(
30
+ extract_all_urls(response, custom_path_root, type) +
31
+ extract_download_links(response, type) +
32
+ extract_embedded_images(response, type)
31
33
  ).uniq
32
34
  end
33
35
 
34
- def extract_download_links(response, custom_path_root, type)
36
+ def extract_all_urls(response, custom_path_root, type)
37
+ parse_html(response.body)
38
+ .to_s
39
+ .split(/\s+/)
40
+ .find_all { |u| u =~ /^https?:/ }
41
+ .compact
42
+ .select { |link| (link.include? type || link.include?(custom_path_root)) }
43
+ .map { |link| add_base_url(link, custom_path_root) }
44
+ end
45
+
46
+ def extract_download_links(response, type)
35
47
  parse_html(response.body)
36
48
  .css('a')
37
49
  .to_a
38
50
  .map { |link| link['href'] }
39
51
  .compact
40
- .select { |link| (link.include? type || link.include?(custom_path_root)) }
52
+ .select { |link| (link.include? type) }
41
53
  .map { |link| add_base_url(link) }
42
54
  end
43
55
 
44
- def extract_embedded_images(response, custom_path_root, type)
56
+ def extract_embedded_images(response, type)
45
57
  return [] unless scrape_images
46
58
 
47
59
  parse_html(response.body)
@@ -49,15 +61,34 @@ module Pulse
49
61
  .to_a
50
62
  .map { |e| e["src"] }
51
63
  .compact
52
- .select { |link| (link.include? type || link.include?(custom_path_root)) }
64
+ .select { |link| (link.include? type) }
53
65
  .map { |link| add_base_url(link) }
54
66
  end
55
67
 
68
+ def remove_artefacts(urls)
69
+ urls = remove_extra_escape_characters(urls)
70
+ remove_base64(urls)
71
+ end
72
+
73
+ def remove_extra_escape_characters(urls)
74
+ urls.map do |url|
75
+ url.gsub("\">", '')
76
+ end
77
+ end
78
+
79
+ def remove_base64(urls)
80
+ urls.reject do |url|
81
+ url.include?(':image/') || url.include?('base64')
82
+ end
83
+ end
84
+
56
85
  def parse_html(raw_html)
57
86
  Nokogiri::HTML(raw_html)
58
87
  end
59
88
 
60
- def add_base_url(str)
89
+ def add_base_url(str, custom_path_root=nil)
90
+ return str if custom_path_root
91
+
61
92
  if !str.include?('https://') && !str.include?(base_url)
62
93
  "https://#{base_url}#{str}"
63
94
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pulse-downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.24
4
+ version: 0.1.29
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-10 00:00:00.000000000 Z
11
+ date: 2021-05-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: httparty