pulse-downloader 0.1.25 → 0.1.30

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 48153054f979c1f1ed2e36c211edb598f921b41e6f5e0b71bfd8368eaa4112f3
4
- data.tar.gz: 1c831b86a9ead79b51446ed0b911ee54d3b410437d20214fddb57da808a3938a
3
+ metadata.gz: 5b1f82d865e48f194bbbbad697b5049c0a40ede8b74442032dfe6812b75e7ad9
4
+ data.tar.gz: e8f100c954fc76f4f6f32a262ebe353b9a68b7e468c0269cb0ee718f91a7d18c
5
5
  SHA512:
6
- metadata.gz: f0a0e5d9dbeca80d2de42ae1e655191e9aa01c557992b141412cd1dfe86751b3e047b951fc2f6bbf28ea487b7bc8f80d1d173d76c129ca38194cda34f894ef7a
7
- data.tar.gz: cf2a9d907e5f35a792c4d9b7f89ea0e0a96b50de499ade69b62afe0108857a7324836021b3ba338d22c05744789b8e252db08a1ae36177f3a5f671003bee6a32
6
+ metadata.gz: 8e1ce2a8e64b23bf74c36c39a2d9112940eab4778da968582b33dc3818c8803ff8d4a4c6e849cc741cc4f6d6ddf4a778e537173a8c5cfaa09ad51baf18549e86
7
+ data.tar.gz: f996baddaf407802ef415dbc8177c08d8845d007eaf24e9e67ca0aaa8fed770ac1c4c71deaf4632998cc0e8890e23dac4363e9ab77054b670b2cb071db4d2065
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- pulse-downloader (0.1.11)
4
+ pulse-downloader (0.1.26)
5
5
  active_attr (~> 0.15)
6
6
  httparty (~> 0.18)
7
7
  nokogiri (~> 1.11)
@@ -69,7 +69,7 @@ GEM
69
69
  ruby-progressbar
70
70
  mocha (1.11.2)
71
71
  multi_xml (0.6.0)
72
- nokogiri (1.11.3-x86_64-linux)
72
+ nokogiri (1.11.5-x86_64-linux)
73
73
  racc (~> 1.4)
74
74
  options (2.3.2)
75
75
  progress_bar (1.3.3)
data/README.md CHANGED
@@ -33,7 +33,8 @@ client = Pulse::Downloader::Client.new(
33
33
  drop_exitsing_files_in_path: false,
34
34
  save_and_dont_return: true,
35
35
  report_time: false,
36
- progress_bar: true
36
+ progress_bar: true,
37
+ scrape_images: true
37
38
  )
38
39
 
39
40
  client.call!
@@ -13,6 +13,7 @@ module Pulse
13
13
  :save_path,
14
14
  :read_from_save_path,
15
15
  :verify_ssl,
16
+ :headers,
16
17
  :drop_exitsing_files_in_path,
17
18
  :save_and_dont_return,
18
19
  :report_time,
@@ -36,6 +37,7 @@ module Pulse
36
37
  save_path: '',
37
38
  read_from_save_path: false,
38
39
  verify_ssl: true,
40
+ headers: nil,
39
41
  drop_exitsing_files_in_path: false,
40
42
  save_and_dont_return: true,
41
43
  report_time: false,
@@ -48,6 +50,7 @@ module Pulse
48
50
  @save_path = save_path
49
51
  @read_from_save_path = read_from_save_path
50
52
  @verify_ssl = verify_ssl
53
+ @headers = headers
51
54
  @drop_exitsing_files_in_path = drop_exitsing_files_in_path
52
55
  @save_and_dont_return = save_and_dont_return
53
56
  @report_time = report_time
@@ -8,7 +8,10 @@ module Pulse
8
8
 
9
9
  @start_time = get_micro_second_time
10
10
 
11
- file_data = HTTParty.get(escape(compute_file_link(file_path)), verify: verify_ssl)
11
+ file_data = HTTParty.get(
12
+ escape(compute_file_link(file_path)),
13
+ verify: verify_ssl
14
+ )
12
15
 
13
16
  @end_time = get_micro_second_time
14
17
 
@@ -1,5 +1,5 @@
1
1
  module Pulse
2
2
  module Downloader
3
- VERSION = "0.1.25"
3
+ VERSION = "0.1.30"
4
4
  end
5
5
  end
@@ -4,7 +4,7 @@ module Pulse
4
4
  def fetch_file_paths(custom_path_root=nil)
5
5
  @start_time = get_micro_second_time
6
6
 
7
- response = HTTParty.get(url, verify: verify_ssl)
7
+ response = HTTParty.get(url, verify: verify_ssl, headers: headers)
8
8
 
9
9
  @end_time = get_micro_second_time
10
10
 
@@ -25,10 +25,11 @@ module Pulse
25
25
 
26
26
  def extract_file_urls(response, custom_path_root, type)
27
27
  return [] if response.body.nil? || response.body.empty?
28
- (
28
+
29
+ remove_artefacts(
29
30
  extract_all_urls(response, custom_path_root, type) +
30
- extract_download_links(response, custom_path_root, type) +
31
- extract_embedded_images(response, custom_path_root, type)
31
+ extract_download_links(response, type) +
32
+ extract_embedded_images(response, type)
32
33
  ).uniq
33
34
  end
34
35
 
@@ -39,20 +40,20 @@ module Pulse
39
40
  .find_all { |u| u =~ /^https?:/ }
40
41
  .compact
41
42
  .select { |link| (link.include? type || link.include?(custom_path_root)) }
42
- .map { |link| add_base_url(link) }
43
+ .map { |link| add_base_url(link, custom_path_root) }
43
44
  end
44
45
 
45
- def extract_download_links(response, custom_path_root, type)
46
+ def extract_download_links(response, type)
46
47
  parse_html(response.body)
47
48
  .css('a')
48
49
  .to_a
49
50
  .map { |link| link['href'] }
50
51
  .compact
51
- .select { |link| (link.include? type || link.include?(custom_path_root)) }
52
+ .select { |link| (link.include? type) }
52
53
  .map { |link| add_base_url(link) }
53
54
  end
54
55
 
55
- def extract_embedded_images(response, custom_path_root, type)
56
+ def extract_embedded_images(response, type)
56
57
  return [] unless scrape_images
57
58
 
58
59
  parse_html(response.body)
@@ -60,15 +61,34 @@ module Pulse
60
61
  .to_a
61
62
  .map { |e| e["src"] }
62
63
  .compact
63
- .select { |link| (link.include? type || link.include?(custom_path_root)) }
64
+ .select { |link| (link.include? type) }
64
65
  .map { |link| add_base_url(link) }
65
66
  end
66
67
 
68
+ def remove_artefacts(urls)
69
+ urls = remove_extra_escape_characters(urls)
70
+ remove_base64(urls)
71
+ end
72
+
73
+ def remove_extra_escape_characters(urls)
74
+ urls.map do |url|
75
+ url.gsub("\">", '')
76
+ end
77
+ end
78
+
79
+ def remove_base64(urls)
80
+ urls.reject do |url|
81
+ url.include?(':image/') || url.include?('base64')
82
+ end
83
+ end
84
+
67
85
  def parse_html(raw_html)
68
86
  Nokogiri::HTML(raw_html)
69
87
  end
70
88
 
71
- def add_base_url(str)
89
+ def add_base_url(str, custom_path_root=nil)
90
+ return str if custom_path_root
91
+
72
92
  if !str.include?('https://') && !str.include?(base_url)
73
93
  "https://#{base_url}#{str}"
74
94
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pulse-downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.25
4
+ version: 0.1.30
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-10 00:00:00.000000000 Z
11
+ date: 2021-05-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: httparty