pulse-downloader 0.1.22 → 0.1.27

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 00f53bf497e8c65d6211956814fa94613857b23bc0b1e3f35c24d69998645941
4
- data.tar.gz: 77447e1a3b1df593d5c836074c41e6caa57fef28f8926a5f7e1859fb1b58b39d
3
+ metadata.gz: ed736b97e09aaec1104ac4e817b981c2ebb88fbac583abe0ebf44e721143b087
4
+ data.tar.gz: a81646945f24e428738e8a514e671f2f14682f5824faee1e63e07bd498fe519c
5
5
  SHA512:
6
- metadata.gz: 627db0d3d117a199943ba7ade0c27af871f4934e6879bb788fcade5946b95bfa0378d237fb5eee0f68685cf211b042a3b35986441466b0d6811875fe7103304f
7
- data.tar.gz: 1f7c99a75487b6c615cb94bd8def2ed1f5316036339318244329648be8800dc5ab867a8022e956e1a46ac0e0cb58f4502030453c7efadd7131d7edbb69a5a538
6
+ metadata.gz: b684940cab23055ec977672dd8472fcb38ebf8e70e59d18f2a76270a3234f5873c7a6bee47877760c54733af2c073bbaa7a05eec33266a33df316482baa6d836
7
+ data.tar.gz: 9b1737bc6607585c6fb6081d9c1b3031bd53ffccc5cce72a0650b163792742dde2b7dffa6ed663775a7cd1cad50f1fb964c07bc555ceee7a2ca66637af25b41a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- pulse-downloader (0.1.11)
4
+ pulse-downloader (0.1.26)
5
5
  active_attr (~> 0.15)
6
6
  httparty (~> 0.18)
7
7
  nokogiri (~> 1.11)
@@ -69,7 +69,7 @@ GEM
69
69
  ruby-progressbar
70
70
  mocha (1.11.2)
71
71
  multi_xml (0.6.0)
72
- nokogiri (1.11.3-x86_64-linux)
72
+ nokogiri (1.11.5-x86_64-linux)
73
73
  racc (~> 1.4)
74
74
  options (2.3.2)
75
75
  progress_bar (1.3.3)
data/README.md CHANGED
@@ -33,7 +33,8 @@ client = Pulse::Downloader::Client.new(
33
33
  drop_exitsing_files_in_path: false,
34
34
  save_and_dont_return: true,
35
35
  report_time: false,
36
- progress_bar: true
36
+ progress_bar: true,
37
+ scrape_images: true
37
38
  )
38
39
 
39
40
  client.call!
@@ -13,6 +13,7 @@ module Pulse
13
13
  :save_path,
14
14
  :read_from_save_path,
15
15
  :verify_ssl,
16
+ :headers,
16
17
  :drop_exitsing_files_in_path,
17
18
  :save_and_dont_return,
18
19
  :report_time,
@@ -36,6 +37,7 @@ module Pulse
36
37
  save_path: '',
37
38
  read_from_save_path: false,
38
39
  verify_ssl: true,
40
+ headers: nil,
39
41
  drop_exitsing_files_in_path: false,
40
42
  save_and_dont_return: true,
41
43
  report_time: false,
@@ -48,6 +50,7 @@ module Pulse
48
50
  @save_path = save_path
49
51
  @read_from_save_path = read_from_save_path
50
52
  @verify_ssl = verify_ssl
53
+ @headers = headers
51
54
  @drop_exitsing_files_in_path = drop_exitsing_files_in_path
52
55
  @save_and_dont_return = save_and_dont_return
53
56
  @report_time = report_time
@@ -8,7 +8,11 @@ module Pulse
8
8
 
9
9
  @start_time = get_micro_second_time
10
10
 
11
- file_data = HTTParty.get(escape(compute_file_link(file_path)), verify: verify_ssl)
11
+ file_data = HTTParty.get(
12
+ escape(compute_file_link(file_path)),
13
+ verify: verify_ssl,
14
+ headers: headers
15
+ )
12
16
 
13
17
  @end_time = get_micro_second_time
14
18
 
@@ -1,5 +1,5 @@
1
1
  module Pulse
2
2
  module Downloader
3
- VERSION = "0.1.22"
3
+ VERSION = "0.1.27"
4
4
  end
5
5
  end
@@ -1,7 +1,7 @@
1
1
  module Pulse
2
2
  module Downloader
3
3
  module WebPageParser
4
- def fetch_file_paths
4
+ def fetch_file_paths(custom_path_root=nil)
5
5
  @start_time = get_micro_second_time
6
6
 
7
7
  response = HTTParty.get(url, verify: verify_ssl)
@@ -12,29 +12,47 @@ module Pulse
12
12
  print_time
13
13
  end
14
14
 
15
- extract_file_urls(response)
15
+ if file_type.is_a?(Array)
16
+ file_type.flat_map do |type|
17
+ extract_file_urls(response, custom_path_root, type)
18
+ end
19
+ else
20
+ extract_file_urls(response, custom_path_root, file_type)
21
+ end
16
22
  end
17
23
 
18
24
  private
19
25
 
20
- def extract_file_urls(response)
26
+ def extract_file_urls(response, custom_path_root, type)
21
27
  return [] if response.body.nil? || response.body.empty?
22
- (
23
- extract_download_links(response) + extract_embedded_images(response)
28
+ remove_base64(
29
+ extract_all_urls(response, custom_path_root, type) +
30
+ extract_download_links(response, custom_path_root, type) +
31
+ extract_embedded_images(response, custom_path_root, type)
24
32
  ).uniq
25
33
  end
26
34
 
27
- def extract_download_links(response)
35
+ def extract_all_urls(response, custom_path_root, type)
36
+ parse_html(response.body)
37
+ .to_s
38
+ .split(/\s+/)
39
+ .find_all { |u| u =~ /^https?:/ }
40
+ .compact
41
+ .select { |link| (link.include? type || link.include?(custom_path_root)) }
42
+ .map { |link| add_base_url(link) }
43
+ end
44
+
45
+ def extract_download_links(response, custom_path_root, type)
28
46
  parse_html(response.body)
29
47
  .css('a')
30
48
  .to_a
31
49
  .map { |link| link['href'] }
32
50
  .compact
33
- .select { |link| link.include? file_type }
51
+ .select { |link| (link.include? type || link.include?(custom_path_root)) }
34
52
  .map { |link| add_base_url(link) }
35
53
  end
36
54
 
37
- def extract_embedded_images(response)
55
+ def extract_embedded_images(response, custom_path_root, type)
38
56
  return [] unless scrape_images
39
57
 
40
58
  parse_html(response.body)
@@ -42,10 +60,16 @@ module Pulse
42
60
  .to_a
43
61
  .map { |e| e["src"] }
44
62
  .compact
45
- .select { |link| link.include? file_type }
63
+ .select { |link| (link.include? type || link.include?(custom_path_root)) }
46
64
  .map { |link| add_base_url(link) }
47
65
  end
48
66
 
67
+ def remove_base64(urls)
68
+ urls.reject do |url|
69
+ url.include?(':image/') || url.include?('base64')
70
+ end
71
+ end
72
+
49
73
  def parse_html(raw_html)
50
74
  Nokogiri::HTML(raw_html)
51
75
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pulse-downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.22
4
+ version: 0.1.27
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-10 00:00:00.000000000 Z
11
+ date: 2021-05-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: httparty