datahen 1.3.1 → 1.4.0
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/lib/datahen/cli/scraper_page.rb +28 -26
 - data/lib/datahen/client/job_page.rb +10 -20
 - data/lib/datahen/client/scraper_job_page.rb +9 -20
 - data/lib/datahen/scraper/batch_parser.rb +1 -1
 - data/lib/datahen/version.rb +1 -1
 - metadata +6 -6
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: ae63999d11bc052d81e3b1de67a0741702dd980e719dd544b5e689f0383e7a34
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: faf53b662afa26409bff83c3007127211863ce33ff45b8c60aab56491fdcafe7
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: b0e7a0ddc975202df66785211cee796e1aef61de921f99ba0481645f59fb65963c03517e1d4f5b471d2ed108087011f20d6b693d6d199e2e96c860412b675415
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 70a13268ba6f3df8f560a1b4d65b261ec4744bbd71eb22b04c25dc5809e70af3541cc30d9dad0e4a4cc22c3a353b90bc24c4a43cfc70e7f82fc2080001596d38
         
     | 
| 
         @@ -28,46 +28,48 @@ module Datahen 
     | 
|
| 
       28 
28 
     | 
    
         
             
                    end
         
     | 
| 
       29 
29 
     | 
    
         
             
                  end
         
     | 
| 
       30 
30 
     | 
    
         | 
| 
       31 
     | 
    
         
            -
                  desc "add <scraper_name> < 
     | 
| 
      
 31 
     | 
    
         
            +
                  desc "add <scraper_name> <page_json>", "Enqueues a page to a scraper's current job"
         
     | 
| 
       32 
32 
     | 
    
         
             
                  long_desc <<-LONGDESC
         
     | 
| 
       33 
33 
     | 
    
         
             
                      Enqueues a page to a scraper's current job\x5
         
     | 
| 
       34 
34 
     | 
    
         
             
                      LONGDESC
         
     | 
| 
       35 
35 
     | 
    
         
             
                  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
         
     | 
| 
       36 
     | 
    
         
            -
                   
     | 
| 
       37 
     | 
    
         
            -
                  option :headers, :aliases => :H, type: :string, banner: :JSON, desc: 'Set request headers. Must be in json format. i.e: {"Foo":"bar"} '
         
     | 
| 
       38 
     | 
    
         
            -
                  option :cookie, :aliases => :c, type: :string, desc: 'Set request cookie.'
         
     | 
| 
       39 
     | 
    
         
            -
                  option :vars, :aliases => :v, type: :string, banner: :JSON, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
         
     | 
| 
       40 
     | 
    
         
            -
                  option :page_type, :aliases => :t, desc: 'Set page type'
         
     | 
| 
       41 
     | 
    
         
            -
                  option :priority, type: :numeric, desc: 'Set fetch priority. The higher the value, the sooner the page gets fetched. Default: 0'
         
     | 
| 
       42 
     | 
    
         
            -
                  option :fetch_type, :aliases => :F, desc: 'Set fetch type. Default: http'
         
     | 
| 
       43 
     | 
    
         
            -
                  option :body, :aliases => :b, desc: 'Set request body'
         
     | 
| 
       44 
     | 
    
         
            -
                  option :force_fetch, :aliases => :f, type: :boolean, desc: 'Set true to force fetch page that is not within freshness criteria. Default: false'
         
     | 
| 
       45 
     | 
    
         
            -
                  option :freshness, :aliases => :s, desc: 'Set how fresh the page cache is. Accepts timestap format.'
         
     | 
| 
       46 
     | 
    
         
            -
                  option :ua_type, :aliases => :u, desc: 'Set user agent type. Default: desktop'
         
     | 
| 
       47 
     | 
    
         
            -
                  option :no_redirect, :aliases => :n, type: :boolean, desc: 'Set true to not follow redirect. Default: false'
         
     | 
| 
       48 
     | 
    
         
            -
                  option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
         
     | 
| 
       49 
     | 
    
         
            -
                  option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
         
     | 
| 
       50 
     | 
    
         
            -
                  def add(scraper_name, url)
         
     | 
| 
      
 36 
     | 
    
         
            +
                  def add(scraper_name, page_json)
         
     | 
| 
       51 
37 
     | 
    
         
             
                    begin
         
     | 
| 
       52 
     | 
    
         
            -
                       
     | 
| 
       53 
     | 
    
         
            -
                      options[:vars] = JSON.parse(options[:vars]) if options[:vars]
         
     | 
| 
       54 
     | 
    
         
            -
                      method = options[:method]
         
     | 
| 
      
 38 
     | 
    
         
            +
                      page = JSON.parse(page_json)
         
     | 
| 
       55 
39 
     | 
    
         | 
| 
       56 
40 
     | 
    
         
             
                      if options[:job]
         
     | 
| 
       57 
41 
     | 
    
         
             
                        client = Client::JobPage.new(options)
         
     | 
| 
       58 
     | 
    
         
            -
                        puts "#{client.enqueue(options[:job],  
     | 
| 
      
 42 
     | 
    
         
            +
                        puts "#{client.enqueue(options[:job], page, options)}"
         
     | 
| 
       59 
43 
     | 
    
         
             
                      else
         
     | 
| 
       60 
44 
     | 
    
         
             
                        client = Client::ScraperJobPage.new(options)
         
     | 
| 
       61 
     | 
    
         
            -
                        puts "#{client.enqueue(scraper_name,  
     | 
| 
      
 45 
     | 
    
         
            +
                        puts "#{client.enqueue(scraper_name, page, options)}"
         
     | 
| 
       62 
46 
     | 
    
         
             
                      end
         
     | 
| 
       63 
47 
     | 
    
         | 
| 
       64 
48 
     | 
    
         
             
                    rescue JSON::ParserError
         
     | 
| 
       65 
     | 
    
         
            -
             
     | 
| 
       66 
     | 
    
         
            -
             
     | 
| 
       67 
     | 
    
         
            -
             
     | 
| 
       68 
     | 
    
         
            -
             
     | 
| 
       69 
     | 
    
         
            -
             
     | 
| 
      
 49 
     | 
    
         
            +
                        puts "Error: Invalid JSON"
         
     | 
| 
      
 50 
     | 
    
         
            +
                    end
         
     | 
| 
      
 51 
     | 
    
         
            +
                  end
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
                  desc "getgid <scraper_name> <page_json>", "Get the generated GID for a scraper's current job"
         
     | 
| 
      
 55 
     | 
    
         
            +
                  long_desc <<-LONGDESC
         
     | 
| 
      
 56 
     | 
    
         
            +
                      Get the generated GID for a scraper's current job.\x5
         
     | 
| 
      
 57 
     | 
    
         
            +
                      LONGDESC
         
     | 
| 
      
 58 
     | 
    
         
            +
                  option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
         
     | 
| 
      
 59 
     | 
    
         
            +
                  def getgid(scraper_name, page_json)
         
     | 
| 
      
 60 
     | 
    
         
            +
                    begin
         
     | 
| 
      
 61 
     | 
    
         
            +
                      page = JSON.parse(page_json)
         
     | 
| 
      
 62 
     | 
    
         
            +
                      
         
     | 
| 
      
 63 
     | 
    
         
            +
                      if options[:job]
         
     | 
| 
      
 64 
     | 
    
         
            +
                        client = Client::JobPage.new(options)
         
     | 
| 
      
 65 
     | 
    
         
            +
                        puts "#{client.get_gid(options[:job], page,  options)}"
         
     | 
| 
      
 66 
     | 
    
         
            +
                      else
         
     | 
| 
      
 67 
     | 
    
         
            +
                        client = Client::ScraperJobPage.new(options)
         
     | 
| 
      
 68 
     | 
    
         
            +
                        puts "#{client.get_gid(scraper_name, page, options)}"
         
     | 
| 
       70 
69 
     | 
    
         
             
                      end
         
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
      
 71 
     | 
    
         
            +
                    rescue JSON::ParserError
         
     | 
| 
      
 72 
     | 
    
         
            +
                      puts "Error: Invalid JSON"
         
     | 
| 
       71 
73 
     | 
    
         
             
                    end
         
     | 
| 
       72 
74 
     | 
    
         
             
                  end
         
     | 
| 
       73 
75 
     | 
    
         | 
| 
         @@ -24,28 +24,18 @@ module Datahen 
     | 
|
| 
       24 
24 
     | 
    
         
             
                    self.class.put("/jobs/#{job_id}/pages/#{gid}", params)
         
     | 
| 
       25 
25 
     | 
    
         
             
                  end
         
     | 
| 
       26 
26 
     | 
    
         | 
| 
       27 
     | 
    
         
            -
                  def enqueue(job_id,  
     | 
| 
       28 
     | 
    
         
            -
                     
     | 
| 
       29 
     | 
    
         
            -
                    body[:method] =  method != "" ? method : "GET"
         
     | 
| 
       30 
     | 
    
         
            -
                    body[:url] =  url
         
     | 
| 
       31 
     | 
    
         
            -
                    body[:page_type] = opts[:page_type] if opts[:page_type]
         
     | 
| 
       32 
     | 
    
         
            -
                    body[:priority] = opts[:priority] if opts[:priority]
         
     | 
| 
       33 
     | 
    
         
            -
                    body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
         
     | 
| 
       34 
     | 
    
         
            -
                    body[:body] = opts[:body] if opts[:body]
         
     | 
| 
       35 
     | 
    
         
            -
                    body[:headers] = opts[:headers] if opts[:headers]
         
     | 
| 
       36 
     | 
    
         
            -
                    body[:vars] = opts[:vars] if opts[:vars]
         
     | 
| 
       37 
     | 
    
         
            -
                    body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
         
     | 
| 
       38 
     | 
    
         
            -
                    body[:freshness] = opts[:freshness] if opts[:freshness]
         
     | 
| 
       39 
     | 
    
         
            -
                    body[:ua_type] = opts[:ua_type] if opts[:ua_type]
         
     | 
| 
       40 
     | 
    
         
            -
                    body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
         
     | 
| 
       41 
     | 
    
         
            -
                    body[:cookie] = opts[:cookie] if opts[:cookie]
         
     | 
| 
       42 
     | 
    
         
            -
                    body[:max_size] = opts[:max_size] if opts[:max_size]
         
     | 
| 
       43 
     | 
    
         
            -
                    body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
         
     | 
| 
       44 
     | 
    
         
            -
                    body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
         
     | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
       46 
     | 
    
         
            -
                    params = @options.merge({body: body.to_json})
         
     | 
| 
      
 27 
     | 
    
         
            +
                  def enqueue(job_id, page, opts={})
         
     | 
| 
      
 28 
     | 
    
         
            +
                    params = @options.merge(opts).merge({body: page.to_json})
         
     | 
| 
       47 
29 
     | 
    
         | 
| 
       48 
30 
     | 
    
         
             
                    self.class.post("/jobs/#{job_id}/pages", params)
         
     | 
| 
      
 31 
     | 
    
         
            +
                    
         
     | 
| 
      
 32 
     | 
    
         
            +
                  end
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
                  def get_gid(job_id, page, opts={})
         
     | 
| 
      
 35 
     | 
    
         
            +
                  
         
     | 
| 
      
 36 
     | 
    
         
            +
                    params = @options.merge(opts).merge({body: page.to_json})
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
                    self.class.post("/jobs/#{job_id}/generate_gid", params)
         
     | 
| 
       49 
39 
     | 
    
         
             
                  end
         
     | 
| 
       50 
40 
     | 
    
         | 
| 
       51 
41 
     | 
    
         
             
                  def dequeue(job_id, limit, page_types, parse_fetching_failed, opts = {})
         
     | 
| 
         @@ -47,30 +47,19 @@ module Datahen 
     | 
|
| 
       47 
47 
     | 
    
         
             
                    self.class.put("/scrapers/#{scraper_name}/current_job/pages/limbo", params)
         
     | 
| 
       48 
48 
     | 
    
         
             
                  end
         
     | 
| 
       49 
49 
     | 
    
         | 
| 
       50 
     | 
    
         
            -
                  def enqueue(scraper_name,  
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
       52 
     | 
    
         
            -
                    body[:method] =  method != "" ? method : "GET"
         
     | 
| 
       53 
     | 
    
         
            -
                    body[:url] =  url
         
     | 
| 
       54 
     | 
    
         
            -
                    body[:page_type] = opts[:page_type] if opts[:page_type]
         
     | 
| 
       55 
     | 
    
         
            -
                    body[:priority] = opts[:priority] if opts[:priority]
         
     | 
| 
       56 
     | 
    
         
            -
                    body[:fetch_type] = opts[:fetch_type] if opts[:fetch_type]
         
     | 
| 
       57 
     | 
    
         
            -
                    body[:body] = opts[:body] if opts[:body]
         
     | 
| 
       58 
     | 
    
         
            -
                    body[:headers] = opts[:headers] if opts[:headers]
         
     | 
| 
       59 
     | 
    
         
            -
                    body[:vars] = opts[:vars] if opts[:vars]
         
     | 
| 
       60 
     | 
    
         
            -
                    body[:force_fetch] = opts[:force_fetch] if opts[:force_fetch]
         
     | 
| 
       61 
     | 
    
         
            -
                    body[:freshness] = opts[:freshness] if opts[:freshness]
         
     | 
| 
       62 
     | 
    
         
            -
                    body[:ua_type] = opts[:ua_type] if opts[:ua_type]
         
     | 
| 
       63 
     | 
    
         
            -
                    body[:no_redirect] = opts[:no_redirect] if opts[:no_redirect]
         
     | 
| 
       64 
     | 
    
         
            -
                    body[:cookie] = opts[:cookie] if opts[:cookie]
         
     | 
| 
       65 
     | 
    
         
            -
                    body[:max_size] = opts[:max_size] if opts[:max_size]
         
     | 
| 
       66 
     | 
    
         
            -
                    body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
         
     | 
| 
       67 
     | 
    
         
            -
                    body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
         
     | 
| 
       68 
     | 
    
         
            -
             
     | 
| 
       69 
     | 
    
         
            -
                    params = @options.merge({body: body.to_json})
         
     | 
| 
      
 50 
     | 
    
         
            +
                  def enqueue(scraper_name, page, opts={})
         
     | 
| 
      
 51 
     | 
    
         
            +
                  params = @options.merge(opts).merge({body: page.to_json})
         
     | 
| 
       70 
52 
     | 
    
         | 
| 
       71 
53 
     | 
    
         
             
                    self.class.post("/scrapers/#{scraper_name}/current_job/pages", params)
         
     | 
| 
       72 
54 
     | 
    
         
             
                  end
         
     | 
| 
       73 
55 
     | 
    
         | 
| 
      
 56 
     | 
    
         
            +
                  def get_gid(scraper_name, page, opts={})
         
     | 
| 
      
 57 
     | 
    
         
            +
                  
         
     | 
| 
      
 58 
     | 
    
         
            +
                    params = @options.merge(opts).merge({body: page.to_json})
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
      
 60 
     | 
    
         
            +
                    self.class.post("/scrapers/#{scraper_name}/current_job/generate_gid", params)
         
     | 
| 
      
 61 
     | 
    
         
            +
                  end
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
       74 
63 
     | 
    
         
             
                  def find_content(scraper_name, gid)
         
     | 
| 
       75 
64 
     | 
    
         
             
                    self.class.get("/scrapers/#{scraper_name}/current_job/pages/#{gid}/content", @options)
         
     | 
| 
       76 
65 
     | 
    
         
             
                  end
         
     | 
| 
         @@ -219,7 +219,7 @@ module Datahen 
     | 
|
| 
       219 
219 
     | 
    
         
             
                    self.dequeuer_is_alive!
         
     | 
| 
       220 
220 
     | 
    
         | 
| 
       221 
221 
     | 
    
         
             
                    # ensure a valid response or try again
         
     | 
| 
       222 
     | 
    
         
            -
                    if response.nil? || response.response.code.to_i != 200
         
     | 
| 
      
 222 
     | 
    
         
            +
                    if response.body.nil? || response.body.empty? || response.response.code.to_i != 200
         
     | 
| 
       223 
223 
     | 
    
         
             
                      self.repeat_puts(response.nil? ? 'null' : response.body)
         
     | 
| 
       224 
224 
     | 
    
         
             
                      self.recollect_garbage
         
     | 
| 
       225 
225 
     | 
    
         
             
                      return 0
         
     | 
    
        data/lib/datahen/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: datahen
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 1. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 1.4.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Parama Danoesubroto
         
     | 
| 
       8 
     | 
    
         
            -
            autorequire: 
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2023- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2023-11-01 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: thor
         
     | 
| 
         @@ -278,7 +278,7 @@ metadata: 
     | 
|
| 
       278 
278 
     | 
    
         
             
              allowed_push_host: https://rubygems.org
         
     | 
| 
       279 
279 
     | 
    
         
             
              homepage_uri: https://datahen.com
         
     | 
| 
       280 
280 
     | 
    
         
             
              source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
         
     | 
| 
       281 
     | 
    
         
            -
            post_install_message: 
     | 
| 
      
 281 
     | 
    
         
            +
            post_install_message:
         
     | 
| 
       282 
282 
     | 
    
         
             
            rdoc_options: []
         
     | 
| 
       283 
283 
     | 
    
         
             
            require_paths:
         
     | 
| 
       284 
284 
     | 
    
         
             
            - lib
         
     | 
| 
         @@ -293,8 +293,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       293 
293 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       294 
294 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       295 
295 
     | 
    
         
             
            requirements: []
         
     | 
| 
       296 
     | 
    
         
            -
            rubygems_version: 3. 
     | 
| 
       297 
     | 
    
         
            -
            signing_key: 
     | 
| 
      
 296 
     | 
    
         
            +
            rubygems_version: 3.1.4
         
     | 
| 
      
 297 
     | 
    
         
            +
            signing_key:
         
     | 
| 
       298 
298 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       299 
299 
     | 
    
         
             
            summary: DataHen toolbelt for developers
         
     | 
| 
       300 
300 
     | 
    
         
             
            test_files: []
         
     |