pulse-downloader 0.1.24 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/Gemfile.lock +2 -2
 - data/README.md +2 -1
 - data/lib/pulse/downloader/client.rb +3 -0
 - data/lib/pulse/downloader/file_downloader.rb +4 -1
 - data/lib/pulse/downloader/version.rb +1 -1
 - data/lib/pulse/downloader/web_page_parser.rb +40 -9
 - metadata +2 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: ccce085a74c0c4f7710a6e3c7431b191e77bf313a4219ec08e8cd34e78e3340b
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: ad2307a1ed1d368fd53a9fa965d7e864a90a263dac689656c49e8a5f62484d9f
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 654608db5cd95af6aafde8741864bda84dda783348d0d85baf675418b51c65def47c2ce671910f4313a91c0899784ae92e0d4c6da8e43926fafeaf2053ee2c02
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 520111b61ec241a70321f40707659c09317df3d2b950702be65758bb0a29e6c3bf7880fd7c61e57dafb4450ea2c05b51f834dcb3a2c9ad98ca06f3d61d5c327d
         
     | 
    
        data/Gemfile.lock
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            PATH
         
     | 
| 
       2 
2 
     | 
    
         
             
              remote: .
         
     | 
| 
       3 
3 
     | 
    
         
             
              specs:
         
     | 
| 
       4 
     | 
    
         
            -
                pulse-downloader (0.1. 
     | 
| 
      
 4 
     | 
    
         
            +
                pulse-downloader (0.1.26)
         
     | 
| 
       5 
5 
     | 
    
         
             
                  active_attr (~> 0.15)
         
     | 
| 
       6 
6 
     | 
    
         
             
                  httparty (~> 0.18)
         
     | 
| 
       7 
7 
     | 
    
         
             
                  nokogiri (~> 1.11)
         
     | 
| 
         @@ -69,7 +69,7 @@ GEM 
     | 
|
| 
       69 
69 
     | 
    
         
             
                  ruby-progressbar
         
     | 
| 
       70 
70 
     | 
    
         
             
                mocha (1.11.2)
         
     | 
| 
       71 
71 
     | 
    
         
             
                multi_xml (0.6.0)
         
     | 
| 
       72 
     | 
    
         
            -
                nokogiri (1.11. 
     | 
| 
      
 72 
     | 
    
         
            +
                nokogiri (1.11.5-x86_64-linux)
         
     | 
| 
       73 
73 
     | 
    
         
             
                  racc (~> 1.4)
         
     | 
| 
       74 
74 
     | 
    
         
             
                options (2.3.2)
         
     | 
| 
       75 
75 
     | 
    
         
             
                progress_bar (1.3.3)
         
     | 
    
        data/README.md
    CHANGED
    
    
| 
         @@ -13,6 +13,7 @@ module Pulse 
     | 
|
| 
       13 
13 
     | 
    
         
             
                    :save_path,
         
     | 
| 
       14 
14 
     | 
    
         
             
                    :read_from_save_path,
         
     | 
| 
       15 
15 
     | 
    
         
             
                    :verify_ssl,
         
     | 
| 
      
 16 
     | 
    
         
            +
                    :headers,
         
     | 
| 
       16 
17 
     | 
    
         
             
                    :drop_exitsing_files_in_path,
         
     | 
| 
       17 
18 
     | 
    
         
             
                    :save_and_dont_return,
         
     | 
| 
       18 
19 
     | 
    
         
             
                    :report_time,
         
     | 
| 
         @@ -36,6 +37,7 @@ module Pulse 
     | 
|
| 
       36 
37 
     | 
    
         
             
                    save_path: '',
         
     | 
| 
       37 
38 
     | 
    
         
             
                    read_from_save_path: false,
         
     | 
| 
       38 
39 
     | 
    
         
             
                    verify_ssl: true,
         
     | 
| 
      
 40 
     | 
    
         
            +
                    headers: nil,
         
     | 
| 
       39 
41 
     | 
    
         
             
                    drop_exitsing_files_in_path: false,
         
     | 
| 
       40 
42 
     | 
    
         
             
                    save_and_dont_return: true,
         
     | 
| 
       41 
43 
     | 
    
         
             
                    report_time: false,
         
     | 
| 
         @@ -48,6 +50,7 @@ module Pulse 
     | 
|
| 
       48 
50 
     | 
    
         
             
                    @save_path = save_path
         
     | 
| 
       49 
51 
     | 
    
         
             
                    @read_from_save_path = read_from_save_path
         
     | 
| 
       50 
52 
     | 
    
         
             
                    @verify_ssl = verify_ssl
         
     | 
| 
      
 53 
     | 
    
         
            +
                    @headers = headers
         
     | 
| 
       51 
54 
     | 
    
         
             
                    @drop_exitsing_files_in_path = drop_exitsing_files_in_path
         
     | 
| 
       52 
55 
     | 
    
         
             
                    @save_and_dont_return = save_and_dont_return
         
     | 
| 
       53 
56 
     | 
    
         
             
                    @report_time = report_time
         
     | 
| 
         @@ -8,7 +8,10 @@ module Pulse 
     | 
|
| 
       8 
8 
     | 
    
         | 
| 
       9 
9 
     | 
    
         
             
                    @start_time = get_micro_second_time
         
     | 
| 
       10 
10 
     | 
    
         | 
| 
       11 
     | 
    
         
            -
                    file_data = HTTParty.get( 
     | 
| 
      
 11 
     | 
    
         
            +
                    file_data = HTTParty.get(
         
     | 
| 
      
 12 
     | 
    
         
            +
                      escape(compute_file_link(file_path)),
         
     | 
| 
      
 13 
     | 
    
         
            +
                      verify: verify_ssl
         
     | 
| 
      
 14 
     | 
    
         
            +
                    )
         
     | 
| 
       12 
15 
     | 
    
         | 
| 
       13 
16 
     | 
    
         
             
                    @end_time = get_micro_second_time
         
     | 
| 
       14 
17 
     | 
    
         | 
| 
         @@ -4,7 +4,7 @@ module Pulse 
     | 
|
| 
       4 
4 
     | 
    
         
             
                  def fetch_file_paths(custom_path_root=nil)
         
     | 
| 
       5 
5 
     | 
    
         
             
                    @start_time = get_micro_second_time
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
     | 
    
         
            -
                    response = HTTParty.get(url, verify: verify_ssl)
         
     | 
| 
      
 7 
     | 
    
         
            +
                    response = HTTParty.get(url, verify: verify_ssl, headers: headers)
         
     | 
| 
       8 
8 
     | 
    
         | 
| 
       9 
9 
     | 
    
         
             
                    @end_time = get_micro_second_time
         
     | 
| 
       10 
10 
     | 
    
         | 
| 
         @@ -25,23 +25,35 @@ module Pulse 
     | 
|
| 
       25 
25 
     | 
    
         | 
| 
       26 
26 
     | 
    
         
             
                  def extract_file_urls(response, custom_path_root, type)
         
     | 
| 
       27 
27 
     | 
    
         
             
                    return [] if response.body.nil? || response.body.empty?
         
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                    remove_base64(
         
     | 
| 
      
 30 
     | 
    
         
            +
                      extract_all_urls(response, custom_path_root, type) +
         
     | 
| 
      
 31 
     | 
    
         
            +
                        extract_download_links(response, type) +
         
     | 
| 
      
 32 
     | 
    
         
            +
                        extract_embedded_images(response, type)
         
     | 
| 
       31 
33 
     | 
    
         
             
                    ).uniq
         
     | 
| 
       32 
34 
     | 
    
         
             
                  end
         
     | 
| 
       33 
35 
     | 
    
         | 
| 
       34 
     | 
    
         
            -
                  def  
     | 
| 
      
 36 
     | 
    
         
            +
                  def extract_all_urls(response, custom_path_root, type)
         
     | 
| 
      
 37 
     | 
    
         
            +
                    parse_html(response.body)
         
     | 
| 
      
 38 
     | 
    
         
            +
                      .to_s
         
     | 
| 
      
 39 
     | 
    
         
            +
                      .split(/\s+/)
         
     | 
| 
      
 40 
     | 
    
         
            +
                      .find_all { |u| u =~ /^https?:/ }
         
     | 
| 
      
 41 
     | 
    
         
            +
                      .compact
         
     | 
| 
      
 42 
     | 
    
         
            +
                      .select { |link| (link.include? type || link.include?(custom_path_root)) }
         
     | 
| 
      
 43 
     | 
    
         
            +
                      .map { |link| add_base_url(link, custom_path_root) }
         
     | 
| 
      
 44 
     | 
    
         
            +
                  end
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
                  def extract_download_links(response, type)
         
     | 
| 
       35 
47 
     | 
    
         
             
                    parse_html(response.body)
         
     | 
| 
       36 
48 
     | 
    
         
             
                      .css('a')
         
     | 
| 
       37 
49 
     | 
    
         
             
                      .to_a
         
     | 
| 
       38 
50 
     | 
    
         
             
                      .map { |link| link['href'] }
         
     | 
| 
       39 
51 
     | 
    
         
             
                      .compact
         
     | 
| 
       40 
     | 
    
         
            -
                      .select { |link| (link.include? type 
     | 
| 
      
 52 
     | 
    
         
            +
                      .select { |link| (link.include? type) }
         
     | 
| 
       41 
53 
     | 
    
         
             
                      .map { |link| add_base_url(link) }
         
     | 
| 
       42 
54 
     | 
    
         
             
                  end
         
     | 
| 
       43 
55 
     | 
    
         | 
| 
       44 
     | 
    
         
            -
                  def extract_embedded_images(response,  
     | 
| 
      
 56 
     | 
    
         
            +
                  def extract_embedded_images(response, type)
         
     | 
| 
       45 
57 
     | 
    
         
             
                    return [] unless scrape_images
         
     | 
| 
       46 
58 
     | 
    
         | 
| 
       47 
59 
     | 
    
         
             
                    parse_html(response.body)
         
     | 
| 
         @@ -49,15 +61,34 @@ module Pulse 
     | 
|
| 
       49 
61 
     | 
    
         
             
                      .to_a
         
     | 
| 
       50 
62 
     | 
    
         
             
                      .map { |e| e["src"] }
         
     | 
| 
       51 
63 
     | 
    
         
             
                      .compact
         
     | 
| 
       52 
     | 
    
         
            -
                      .select { |link| (link.include? type 
     | 
| 
      
 64 
     | 
    
         
            +
                      .select { |link| (link.include? type) }
         
     | 
| 
       53 
65 
     | 
    
         
             
                      .map { |link| add_base_url(link) }
         
     | 
| 
       54 
66 
     | 
    
         
             
                  end
         
     | 
| 
       55 
67 
     | 
    
         | 
| 
      
 68 
     | 
    
         
            +
                  def remove_artefacts(urls)
         
     | 
| 
      
 69 
     | 
    
         
            +
                    urls = remove_extra_escape_characters(urls)
         
     | 
| 
      
 70 
     | 
    
         
            +
                    remove_base64(urls)
         
     | 
| 
      
 71 
     | 
    
         
            +
                  end
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
                  def remove_extra_escape_characters(urls)
         
     | 
| 
      
 74 
     | 
    
         
            +
                    urls.map do |url|
         
     | 
| 
      
 75 
     | 
    
         
            +
                      url.gsub("\">", '')
         
     | 
| 
      
 76 
     | 
    
         
            +
                    end
         
     | 
| 
      
 77 
     | 
    
         
            +
                  end
         
     | 
| 
      
 78 
     | 
    
         
            +
             
     | 
| 
      
 79 
     | 
    
         
            +
                  def remove_base64(urls)
         
     | 
| 
      
 80 
     | 
    
         
            +
                    urls.reject do |url|
         
     | 
| 
      
 81 
     | 
    
         
            +
                      url.include?(':image/') || url.include?('base64')
         
     | 
| 
      
 82 
     | 
    
         
            +
                    end
         
     | 
| 
      
 83 
     | 
    
         
            +
                  end
         
     | 
| 
      
 84 
     | 
    
         
            +
             
     | 
| 
       56 
85 
     | 
    
         
             
                  def parse_html(raw_html)
         
     | 
| 
       57 
86 
     | 
    
         
             
                    Nokogiri::HTML(raw_html)
         
     | 
| 
       58 
87 
     | 
    
         
             
                  end
         
     | 
| 
       59 
88 
     | 
    
         | 
| 
       60 
     | 
    
         
            -
                  def add_base_url(str)
         
     | 
| 
      
 89 
     | 
    
         
            +
                  def add_base_url(str, custom_path_root=nil)
         
     | 
| 
      
 90 
     | 
    
         
            +
                    return str if custom_path_root
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
       61 
92 
     | 
    
         
             
                    if !str.include?('https://') && !str.include?(base_url)
         
     | 
| 
       62 
93 
     | 
    
         
             
                      "https://#{base_url}#{str}"
         
     | 
| 
       63 
94 
     | 
    
         
             
                    else
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: pulse-downloader
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.29
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - trex22
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2021-05- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2021-05-24 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: httparty
         
     |