pulse-downloader 0.1.23 → 0.1.28

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e354feaf3ad68d91b2b58b1410ad297582ace9b6f3493b17cf8ae11d11c18d8d
4
- data.tar.gz: 3e036681abb31307b958bba6038d1fda3a9634195d71c61af3e8896bd695b22a
3
+ metadata.gz: 2a9e6c9185000eb45fe03fcf5ec6739aa0b4df48cd9733654cc2f312ca156b54
4
+ data.tar.gz: f03dfbbbecbd75ffe795b194c0ac19689021e1a74a4668ffb6573b6193eb0627
5
5
  SHA512:
6
- metadata.gz: cf401b59b944a0bdca308998c19f4c8b996fadf60cc8118c06a909b5eaa1ca98d9f0f3d234af7050ce71611475d8f9a9c4a79503003649870a5d7f18dc62caa0
7
- data.tar.gz: 98bd01ea727425dad6a932fd431c241b023d954134827ac781faab9b90559a3d05c75916eb4e0bf99d4f8f4c4244b2ce7b019df8ced9c474ad8efe18fa4e7e7d
6
+ metadata.gz: 706da41a31ad0fda7b4a17b748f8271f758973510039d2e3eac79115761ef71e970eee042d3ef30ed2a080ae7e64f49584d8110c249022a4bca3029c48cc1024
7
+ data.tar.gz: 97c3c92845700c69343eba19fb93a9b9de71fe44ceca0ff986e25df8cc69ffb71a48c778eaa145df4ae802295b148feaee768814c52124e4ca74079e8ca9e807
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- pulse-downloader (0.1.11)
4
+ pulse-downloader (0.1.26)
5
5
  active_attr (~> 0.15)
6
6
  httparty (~> 0.18)
7
7
  nokogiri (~> 1.11)
@@ -69,7 +69,7 @@ GEM
69
69
  ruby-progressbar
70
70
  mocha (1.11.2)
71
71
  multi_xml (0.6.0)
72
- nokogiri (1.11.3-x86_64-linux)
72
+ nokogiri (1.11.5-x86_64-linux)
73
73
  racc (~> 1.4)
74
74
  options (2.3.2)
75
75
  progress_bar (1.3.3)
data/README.md CHANGED
@@ -33,7 +33,8 @@ client = Pulse::Downloader::Client.new(
33
33
  drop_exitsing_files_in_path: false,
34
34
  save_and_dont_return: true,
35
35
  report_time: false,
36
- progress_bar: true
36
+ progress_bar: true,
37
+ scrape_images: true
37
38
  )
38
39
 
39
40
  client.call!
@@ -13,6 +13,7 @@ module Pulse
13
13
  :save_path,
14
14
  :read_from_save_path,
15
15
  :verify_ssl,
16
+ :headers,
16
17
  :drop_exitsing_files_in_path,
17
18
  :save_and_dont_return,
18
19
  :report_time,
@@ -36,6 +37,7 @@ module Pulse
36
37
  save_path: '',
37
38
  read_from_save_path: false,
38
39
  verify_ssl: true,
40
+ headers: nil,
39
41
  drop_exitsing_files_in_path: false,
40
42
  save_and_dont_return: true,
41
43
  report_time: false,
@@ -48,6 +50,7 @@ module Pulse
48
50
  @save_path = save_path
49
51
  @read_from_save_path = read_from_save_path
50
52
  @verify_ssl = verify_ssl
53
+ @headers = headers
51
54
  @drop_exitsing_files_in_path = drop_exitsing_files_in_path
52
55
  @save_and_dont_return = save_and_dont_return
53
56
  @report_time = report_time
@@ -8,7 +8,10 @@ module Pulse
8
8
 
9
9
  @start_time = get_micro_second_time
10
10
 
11
- file_data = HTTParty.get(escape(compute_file_link(file_path)), verify: verify_ssl)
11
+ file_data = HTTParty.get(
12
+ escape(compute_file_link(file_path)),
13
+ verify: verify_ssl
14
+ )
12
15
 
13
16
  @end_time = get_micro_second_time
14
17
 
@@ -1,5 +1,5 @@
1
1
  module Pulse
2
2
  module Downloader
3
- VERSION = "0.1.23"
3
+ VERSION = "0.1.28"
4
4
  end
5
5
  end
@@ -4,7 +4,7 @@ module Pulse
4
4
  def fetch_file_paths(custom_path_root=nil)
5
5
  @start_time = get_micro_second_time
6
6
 
7
- response = HTTParty.get(url, verify: verify_ssl)
7
+ response = HTTParty.get(url, verify: verify_ssl, headers: headers)
8
8
 
9
9
  @end_time = get_micro_second_time
10
10
 
@@ -12,30 +12,48 @@ module Pulse
12
12
  print_time
13
13
  end
14
14
 
15
- extract_file_urls(response, custom_path_root)
15
+ if file_type.is_a?(Array)
16
+ file_type.flat_map do |type|
17
+ extract_file_urls(response, custom_path_root, type)
18
+ end
19
+ else
20
+ extract_file_urls(response, custom_path_root, file_type)
21
+ end
16
22
  end
17
23
 
18
24
  private
19
25
 
20
- def extract_file_urls(response, custom_path_root)
26
+ def extract_file_urls(response, custom_path_root, type)
21
27
  return [] if response.body.nil? || response.body.empty?
22
- (
23
- extract_download_links(response, custom_path_root) +
24
- extract_embedded_images(response, custom_path_root)
28
+
29
+ remove_base64(
30
+ extract_all_urls(response, custom_path_root, type) +
31
+ extract_download_links(response, type) +
32
+ extract_embedded_images(response, type)
25
33
  ).uniq
26
34
  end
27
35
 
28
- def extract_download_links(response, custom_path_root)
36
+ def extract_all_urls(response, custom_path_root, type)
37
+ parse_html(response.body)
38
+ .to_s
39
+ .split(/\s+/)
40
+ .find_all { |u| u =~ /^https?:/ }
41
+ .compact
42
+ .select { |link| (link.include? type || link.include?(custom_path_root)) }
43
+ .map { |link| add_base_url(link) }
44
+ end
45
+
46
+ def extract_download_links(response, type)
29
47
  parse_html(response.body)
30
48
  .css('a')
31
49
  .to_a
32
50
  .map { |link| link['href'] }
33
51
  .compact
34
- .select { |link| (link.include? file_type || link.include?(custom_path_root)) }
52
+ .select { |link| (link.include? type) }
35
53
  .map { |link| add_base_url(link) }
36
54
  end
37
55
 
38
- def extract_embedded_images(response, custom_path_root)
56
+ def extract_embedded_images(response, type)
39
57
  return [] unless scrape_images
40
58
 
41
59
  parse_html(response.body)
@@ -43,10 +61,16 @@ module Pulse
43
61
  .to_a
44
62
  .map { |e| e["src"] }
45
63
  .compact
46
- .select { |link| (link.include? file_type || link.include?(custom_path_root)) }
64
+ .select { |link| (link.include? type) }
47
65
  .map { |link| add_base_url(link) }
48
66
  end
49
67
 
68
+ def remove_base64(urls)
69
+ urls.reject do |url|
70
+ url.include?(':image/') || url.include?('base64')
71
+ end
72
+ end
73
+
50
74
  def parse_html(raw_html)
51
75
  Nokogiri::HTML(raw_html)
52
76
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pulse-downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.23
4
+ version: 0.1.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-10 00:00:00.000000000 Z
11
+ date: 2021-05-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: httparty