datahen 1.4.0 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/datahen/cli/job_output.rb +36 -6
- data/lib/datahen/cli/scraper.rb +27 -2
- data/lib/datahen/cli/scraper_job.rb +4 -0
- data/lib/datahen/cli/scraper_page.rb +19 -2
- data/lib/datahen/cli/scraper_task.rb +48 -0
- data/lib/datahen/cli.rb +1 -0
- data/lib/datahen/client/base.rb +15 -2
- data/lib/datahen/client/job.rb +5 -1
- data/lib/datahen/client/job_output.rb +1 -1
- data/lib/datahen/client/job_page.rb +9 -0
- data/lib/datahen/client/job_task.rb +17 -0
- data/lib/datahen/client/scraper.rb +8 -0
- data/lib/datahen/client/scraper_job.rb +8 -0
- data/lib/datahen/client/scraper_task.rb +17 -0
- data/lib/datahen/client.rb +2 -0
- data/lib/datahen/scraper/batch_parser.rb +6 -2
- data/lib/datahen/scraper/executor.rb +7 -2
- data/lib/datahen/scraper/ruby_parser_executor.rb +10 -3
- data/lib/datahen/version.rb +1 -1
- metadata +9 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 96d2bc30d1c96ce684d83efa54b6dff5966db2a1bba7ab4856b11caba2803086
+  data.tar.gz: 985712d5d7e6559ac64b76669241f56d704c754deb06a164e1f449aad10ef29e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d9c6bd3e60034339a8354fe4bda365b91f21b6ec68da8f384d7380abcafa5ccce2c2aacd6cc7a8da37378b8681afe58765bcc461211812c623a8958eac7a5f72
+  data.tar.gz: ac5eb5c8de4e4b0a6d28d96179bab4bf347662247b94e775ed0a25e0f0ef00a542f01f8a1a06525b565e7bd1055d5cd30b480a28d28c7ebf5de893b89b9f5e3a
data/lib/datahen/cli/job_output.rb
CHANGED
@@ -20,10 +20,20 @@ module Datahen
         collection = options.fetch(:collection) { 'default' }
         if options[:job]
           client = Client::JobOutput.new(options)
-
+          json = JSON.parse(client.all(options[:job], collection).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         else
           client = Client::ScraperJobOutput.new(options)
-
+          json = JSON.parse(client.all(scraper_name, collection).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         end
       end
 
@@ -38,10 +48,20 @@ module Datahen
         collection = options.fetch(:collection) { 'default' }
         if options[:job]
           client = Client::JobOutput.new(options)
-
+          json = JSON.parse(client.find(options[:job], collection, id).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         else
           client = Client::ScraperJobOutput.new(options)
-
+          json = JSON.parse(client.find(scraper_name, collection, id).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         end
       end
 
@@ -56,10 +76,20 @@ module Datahen
 
         if options[:job]
           client = Client::JobOutput.new(options)
-
+          json = JSON.parse(client.collections(options[:job]).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
        else
          client = Client::ScraperJobOutput.new(options)
-
+          json = JSON.parse(client.collections(scraper_name).body)
+          if json['error'] == ""
+            puts "#{JSON.pretty_generate(json['data'])}"
+          else
+            puts "#{JSON.pretty_generate(json['error'])}"
+          end
         end
       end
 
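The same pattern repeats across all three subcommands: instead of printing the raw HTTP response, each now parses the body and branches on the `error` field of the response envelope. A minimal standalone sketch of that handling, assuming the API wraps results as `{"error": "", "data": ...}` (the envelope shape implied by the diff; the sample bodies below are made up):

```ruby
require 'json'

# Sketch of the new CLI output handling; `body` stands in for an HTTP
# response body string returned by the DataHen API.
def print_envelope(body)
  json = JSON.parse(body)
  if json['error'] == ""
    puts JSON.pretty_generate(json['data'])   # success: pretty-print the records
  else
    puts JSON.pretty_generate(json['error'])  # failure: surface the error message
  end
end

print_envelope('{"error":"","data":[{"_collection":"default","title":"foo"}]}')
print_envelope('{"error":"job not found","data":null}')
```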
data/lib/datahen/cli/scraper.rb
CHANGED
@@ -37,6 +37,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def create(scraper_name, git_repository)
       # puts "options #{options}"
       client = Client::Scraper.new(options)
@@ -66,6 +70,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       client = Client::Scraper.new(options)
       puts "#{client.update(scraper_name, options)}"
@@ -106,6 +114,10 @@ module Datahen
     option :vars, type: :string, banner: :JSON, desc: 'Set input vars. Must be in json format. i.e: [{"name":"foo", "value":"bar", "secret":false}] '
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def start(scraper_name)
       client = Client::ScraperJob.new(options)
       puts "Starting a scrape job..."
@@ -188,9 +200,19 @@ module Datahen
     def history(scraper_name)
       client = Client::JobStat.new(options)
       if options[:job]
-
+        json = JSON.parse(client.job_stats_history(options[:job], options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
-
+        json = JSON.parse(client.scraper_job_stats_history(scraper_name, options).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
@@ -227,6 +249,9 @@ module Datahen
     desc "var SUBCOMMAND ...ARGS", "for managing scraper's variables"
     subcommand "var", ScraperVar
 
+    desc "task SUBCOMMAND ...ARGS", "manage task on a job"
+    subcommand "task", ScraperTask
+
 
   end
 end
data/lib/datahen/cli/scraper_job.rb
CHANGED
@@ -108,6 +108,10 @@ module Datahen
     option :max_page_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :enable_global_cache, type: :boolean, desc: 'Set true to enable page cache. Default: false'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
+    option :prevent_kb_autoscaler, type: :boolean, desc: 'Set true to prevent the autoscaler from restarting the job. Default: false'
     def update(scraper_name)
       if options[:job]
         client = Client::Job.new(options)
data/lib/datahen/cli/scraper_page.rb
CHANGED
@@ -13,6 +13,10 @@ module Datahen
     LONGDESC
     option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
     option :page_type, :aliases => :t, type: :string, desc: 'Filter by page_type'
+    option :url, :aliases => :u, type: :string, desc: 'Filter by url'
+    option :effective_url, :aliases => :U, type: :string, desc: 'Filter by effective_url'
+    option :body, :aliases => :b, type: :string, desc: 'Filter by body'
+    option :parent_gid, :aliases => :G, type: :string, desc: 'Filter by parent_gid'
     option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
     option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
     option :fetch_fail, type: :boolean, desc: 'Returns only pages that fails fetching.'
@@ -21,10 +25,20 @@ module Datahen
     def list(scraper_name)
       if options[:job]
         client = Client::JobPage.new(options)
-
+        json = JSON.parse(client.all(options[:job]).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       else
         client = Client::ScraperJobPage.new(options)
-
+        json = JSON.parse(client.all(scraper_name).body)
+        if json['error'] == ""
+          puts "#{JSON.pretty_generate(json['data'])}"
+        else
+          puts "#{JSON.pretty_generate(json['error'])}"
+        end
       end
     end
 
@@ -84,6 +98,9 @@ module Datahen
     option :vars, :aliases => :v, type: :string, desc: 'Set user-defined page variables. Must be in json format. i.e: {"Foo":"bar"}'
     option :max_size, type: :numeric, desc: 'Set a value to set max page size when fetching a page. Set a value grather than 0 to set it as limit, 0 means any size. Default: 0'
     option :retry_interval, type: :numeric, desc: 'Set a value to set retry time interval on seconds when refetching a page. Set a value grather than 0 to set it as new time to refetch, 0 means default time. Default: 0'
+    option :soft_fetching_try_limit, type: :numeric, desc: 'Set the soft fetching try limit value.'
+    option :soft_refetch_limit, type: :numeric, desc: 'Set the soft refetch limit value.'
+    option :parsing_try_limit, type: :numeric, desc: 'Set the parsing try limit value.'
     def update(scraper_name, gid)
       begin
         options[:vars] = JSON.parse(options[:vars]) if options[:vars]
data/lib/datahen/cli/scraper_task.rb
ADDED
@@ -0,0 +1,48 @@
+module Datahen
+  class CLI < Thor
+    class ScraperTask < Thor
+      package_name "scraper task"
+      def self.banner(command, namespace = nil, subcommand = false)
+        "#{basename} #{@package_name} #{command.usage}"
+      end
+
+      desc "list <scraper_name>", "List Tasks on a scraper's current job"
+      long_desc <<-LONGDESC
+        List all tasks in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      option :page, :aliases => :p, type: :numeric, desc: 'Get the next set of records by page.'
+      option :per_page, :aliases => :P, type: :numeric, desc: 'Number of records per page. Max 500 per page.'
+      option :status, type: :array, desc: 'Returns only tasks with specific status.'
+      option :action, type: :array, desc: 'Returns only tasks with specific action.'
+      option :"include-system", type: :boolean, desc: 'If it is true, will returns all actions. If it is false only tasks with specific action ["refetch", "reparse", "terminate"].'
+      def list(scraper_name)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.all(options[:job])}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.all(scraper_name)}"
+        end
+      end
+
+
+      desc "show <scraper_name> <task_id>", "Show task in scraper's current job"
+      long_desc <<-LONGDESC
+        Shows a task in a scraper's current job or given job ID.\x5
+      LONGDESC
+      option :job, :aliases => :j, type: :numeric, desc: 'Set a specific job ID'
+      def show(scraper_name, task_id)
+        if options[:job]
+          client = Client::JobTask.new(options)
+          puts "#{client.find(options[:job], task_id)}"
+        else
+          client = Client::ScraperTask.new(options)
+          puts "#{client.find(scraper_name, task_id)}"
+        end
+      end
+
+    end
+  end
+
+end
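Since the class is registered as a Thor subcommand in scraper.rb above and required in cli.rb below, the new commands should be reachable as `datahen scraper task ...`. A hedged sketch of driving them programmatically; the scraper name, job ID, and task ID are placeholders:

```ruby
require 'datahen'

# Thor CLIs can be started with an argv-style array; these mirror
# `datahen scraper task list ...` and `datahen scraper task show ...`.
Datahen::CLI.start(%w[scraper task list my-scraper --job 123])
Datahen::CLI.start(%w[scraper task show my-scraper task-456])
```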
data/lib/datahen/cli.rb
CHANGED
@@ -11,6 +11,7 @@ require 'datahen/cli/scraper_page'
 require 'datahen/cli/job_output'
 require 'datahen/cli/job'
 require 'datahen/cli/scraper_deployment'
+require 'datahen/cli/scraper_task'
 require 'datahen/cli/scraper'
 require 'datahen/cli/parser'
 require 'datahen/cli/seeder'
data/lib/datahen/client/base.rb
CHANGED
@@ -56,12 +56,18 @@ module Datahen
        target.merge(source.select{|k,v|target.has_key?(k)})
      end
 
-      def retry times, delay = nil, err_msg = nil
+      def retry times, delay = nil, err_msg = nil, stream = false
        limit = times.nil? ? nil : times.to_i
        delay = delay.nil? ? 5 : delay.to_i
        count = 0
        begin
-          yield
+          val = yield
+          if stream
+            return if val.nil?
+            if val['error'] != ""
+              raise StandardError.new(val['error'])
+            end
+          end
        rescue Error::CustomRetryError, StandardError => e
          is_custom_retry = e.is_a? Error::CustomRetryError
          real_delay = is_custom_retry ? e.delay : delay
@@ -81,6 +87,7 @@ module Datahen
          puts "#{err_msg.nil? ? '' : "#{err_msg} "}Retry \##{count}#{should_aprox ? '+' : ''}..."
          retry
        end
+        val
      end
 
      def initialize(opts={})
@@ -105,6 +112,10 @@ module Datahen
        query[:parsefail] = opts[:parse_fail] if opts[:parse_fail]
        query[:status] = opts[:status] if opts[:status]
        query[:page_type] = opts[:page_type] if opts[:page_type]
+        query[:url] = opts[:url] if opts[:url]
+        query[:effective_url] = opts[:effective_url] if opts[:effective_url]
+        query[:body] = opts[:body] if opts[:body]
+        query[:parent_gid] = opts[:parent_gid] if opts[:parent_gid]
        query[:gid] = opts[:gid] if opts[:gid]
        query[:"min-timestamp"] = opts[:"min-timestamp"] if opts[:"min-timestamp"]
        query[:"max-timestamp"] = opts[:"max-timestamp"] if opts[:"max-timestamp"]
@@ -112,6 +123,8 @@ module Datahen
        query[:order] = opts[:order] if opts[:order]
        query[:filter] = opts[:filter] if opts[:filter]
        query[:force] = opts[:force] if opts[:force]
+        query[:action] = opts[:action] if opts[:action]
+        query[:"include-system"] = opts[:"include-system"] if opts[:"include-system"]
 
        if opts[:query]
          if opts[:query].is_a?(Hash)
data/lib/datahen/client/job.rb
CHANGED
@@ -25,6 +25,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        params = @options.merge({body: body.to_json})
 
        self.class.put("/jobs/#{job_id}", params)
@@ -97,7 +101,7 @@ module Datahen
      def sync_schema(job_id, opts={})
        params = @options.merge(opts)
 
-        self.class.put("/
+        self.class.put("/jobs/#{job_id}/sync/schema", params)
      end
 
    end
data/lib/datahen/client/job_output.rb
CHANGED
@@ -7,7 +7,7 @@ module Datahen
 
      def all(job_id, collection = 'default', opts = {})
        limit = opts.has_key?(:retry_limit) ? opts.fetch(:retry_limit) : 0
-        self.retry(limit, 10, "Error while updating the seeder.") do
+        self.retry(limit, 10, "Error while updating the seeder.", true) do
          self.class.get("/jobs/#{job_id}/output/collections/#{collection}/records", @options)
        end
      end
data/lib/datahen/client/job_page.rb
CHANGED
@@ -18,6 +18,9 @@ module Datahen
        body[:max_size] = opts[:max_size] if opts[:max_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
 
        params = @options.merge({body: body.to_json})
 
@@ -55,6 +58,7 @@ module Datahen
        body[:parsing_status] = opts.fetch(:parsing_status){ nil }
        body[:log_error] = opts[:log_error] if opts[:log_error]
        body[:keep_outputs] = !!opts[:keep_outputs] if opts.has_key?(:keep_outputs)
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts.fetch(:parsing_try_limit){ nil }
 
        params = @options.merge({body: body.to_json})
 
@@ -90,6 +94,11 @@ module Datahen
        params = @options.merge(opts)
        self.class.put("/jobs/#{job_id}/pages/limbo", params)
      end
+
+      def still_alive(job_id, gid, opts={})
+        params = @options.merge(opts)
+        self.class.put("/jobs/#{job_id}/pages/#{gid}/still_alive", params)
+      end
    end
  end
end
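The new `still_alive` endpoint gives long-running fetch/parse work a heartbeat. A hedged usage sketch; the job ID and GID are placeholders and authentication setup is omitted:

```ruby
require 'datahen'

# Hypothetical heartbeat: tell the API this page is still being worked on.
client = Datahen::Client::JobPage.new
client.still_alive(12345, 'www.example.com-abc123')
```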
data/lib/datahen/client/job_task.rb
ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class JobTask < Datahen::Client::Base
+      def all(job_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks", params)
+      end
+
+      def find(job_id, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/jobs/#{job_id}/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
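A hedged usage sketch for the new client; the job and task IDs are placeholders and authentication setup is omitted:

```ruby
require 'datahen'

client = Datahen::Client::JobTask.new
puts client.all(12345)              # GET /jobs/12345/tasks
puts client.find(12345, 'task-789') # GET /jobs/12345/tasks/task-789
```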
data/lib/datahen/client/scraper.rb
CHANGED
@@ -32,6 +32,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        params = @options.merge({body: body.to_json})
        self.class.post("/scrapers", params)
      end
@@ -57,6 +61,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts.has_key?("max_page_size") || opts.has_key?(:max_page_size)
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        params = @options.merge({body: body.to_json})
 
        self.class.put("/scrapers/#{scraper_name}", params)
data/lib/datahen/client/scraper_job.rb
CHANGED
@@ -15,6 +15,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        if opts[:vars]
          if opts[:vars].is_a?(Array)
            body[:vars] = opts[:vars]
@@ -45,6 +49,10 @@ module Datahen
        body[:max_page_size] = opts[:max_page_size] if opts[:max_page_size]
        body[:enable_global_cache] = opts[:enable_global_cache] if opts.has_key?("enable_global_cache") || opts.has_key?(:enable_global_cache)
        body[:retry_interval] = opts[:retry_interval] if opts[:retry_interval]
+        body[:soft_fetching_try_limit] = opts[:soft_fetching_try_limit] if opts[:soft_fetching_try_limit]
+        body[:soft_refetch_limit] = opts[:soft_refetch_limit] if opts[:soft_refetch_limit]
+        body[:parsing_try_limit] = opts[:parsing_try_limit] if opts[:parsing_try_limit]
+        body[:prevent_kb_autoscaler] = opts[:prevent_kb_autoscaler] if opts.has_key?("prevent_kb_autoscaler") || opts.has_key?(:prevent_kb_autoscaler)
        params = @options.merge({body: body.to_json})
 
        self.class.put("/scrapers/#{scraper_name}/current_job", params)
data/lib/datahen/client/scraper_task.rb
ADDED
@@ -0,0 +1,17 @@
+module Datahen
+  module Client
+    class ScraperTask < Datahen::Client::Base
+      def all(scraper_name, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks", params)
+      end
+
+      def find(scraper_name, task_id, opts={})
+        params = @options.merge(opts)
+        self.class.get("/scrapers/#{scraper_name}/current_job/tasks/#{task_id}", params)
+      end
+
+    end
+
+  end
+end
data/lib/datahen/client.rb
CHANGED
@@ -24,6 +24,8 @@ require "datahen/client/scraper_var"
 require "datahen/client/job_var"
 require "datahen/client/scraper_job_var"
 require "datahen/client/job_finisher"
+require "datahen/client/job_task"
+require "datahen/client/scraper_task"
 
 module Datahen
   module Client
data/lib/datahen/scraper/batch_parser.rb
CHANGED
@@ -227,7 +227,11 @@ module Datahen
 
      # add pages
      count = 0
-
+      json = JSON.parse(response.body)
+      if json['error'] != ""
+        return 0
+      end
+      (json['data'] || []).each do |page|
        count += 1
        next if self.loaded_pages.has_key? page['gid']
        self.pages << (self.loaded_pages[page['gid']] = page)
@@ -307,7 +311,7 @@ module Datahen
          is_waiting = true
          puts "[Worker #{Parallel.worker_number}]: Is waiting for a page..."
          if self.second_dequeue_count > 1 && !self.not_found
-            puts "\nWARNING: Your job
+            puts "\nWARNING: Your job might not be optimized. Consider increasing your job's \"parser_dequeue_scale\" if the `to_parse` queue is not empty or near empty \n"
          end
        end
        self.class.wait 1
data/lib/datahen/scraper/executor.rb
CHANGED
@@ -172,11 +172,16 @@ module Datahen
        response = client.all(query_job_id, collection, {
          retry_limit: retry_limit
        })
-
        if response.code != 200
          raise "response_code: #{response.code}|#{response.parsed_response}"
        end
-
+
+        # check stream error
+        json_data = response.body != 'null' ? response.parsed_response : {}
+        if json_data['error'] != ""
+          raise "response_code: #{response.code}|Stream error: #{json_data['error']}"
+        end
+
+        json_data['data'].nil? ? [] : json_data['data']
      end
 
      # Find one output by collection and query with pagination.
data/lib/datahen/scraper/ruby_parser_executor.rb
CHANGED
@@ -33,7 +33,6 @@ module Datahen
          :failed_content,
          :outputs,
          :pages,
-          :page,
          :save_pages,
          :save_outputs,
          :find_output,
@@ -41,7 +40,8 @@ module Datahen
          :refetch,
          :reparse,
          :limbo,
-          :finish
+          :finish,
+          :still_alive
        ].freeze
      end
 
@@ -240,6 +240,12 @@ module Datahen
        @failed_content ||= get_failed_content(job_id, gid)
      end
 
+      def still_alive page_gid = nil
+        page_gid = gid if page_gid.nil?
+        client = Client::JobPage.new()
+        client.still_alive(job_id, page_gid)
+      end
+
      def handle_error(e)
        error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")
 
@@ -247,7 +253,8 @@ module Datahen
          job_id: job_id,
          gid: gid,
          parsing_status: :failed,
-          log_error: error
+          log_error: error,
+          parsing_try_limit: (page || {})['parsing_try_limit'])
      end
 
    end
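With `:still_alive` added to the exposed-methods list, parser scripts can call it directly. A hedged sketch of a parser snippet, assuming `content` and `outputs` are exposed to the script as in prior releases; the extraction regex and collection name are made up:

```ruby
# Inside a DataHen parser script: heartbeat a slow parse so the page is not
# treated as stale. With no argument, still_alive defaults to the current GID.
items = content.to_s.scan(/<li class="product">.*?<\/li>/m) # hypothetical extraction
items.each_with_index do |html, i|
  still_alive if i % 100 == 0 # ping every 100 items
  outputs << { '_collection' => 'products', 'html' => html }
end
```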
data/lib/datahen/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datahen
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.5.1
 platform: ruby
 authors:
 - Parama Danoesubroto
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2024-01-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
@@ -227,6 +227,7 @@ files:
 - lib/datahen/cli/scraper_job.rb
 - lib/datahen/cli/scraper_job_var.rb
 - lib/datahen/cli/scraper_page.rb
+- lib/datahen/cli/scraper_task.rb
 - lib/datahen/cli/scraper_var.rb
 - lib/datahen/cli/seeder.rb
 - lib/datahen/client.rb
@@ -245,6 +246,7 @@ files:
 - lib/datahen/client/job_output.rb
 - lib/datahen/client/job_page.rb
 - lib/datahen/client/job_stat.rb
+- lib/datahen/client/job_task.rb
 - lib/datahen/client/job_var.rb
 - lib/datahen/client/scraper.rb
 - lib/datahen/client/scraper_deployment.rb
@@ -255,6 +257,7 @@ files:
 - lib/datahen/client/scraper_job_output.rb
 - lib/datahen/client/scraper_job_page.rb
 - lib/datahen/client/scraper_job_var.rb
+- lib/datahen/client/scraper_task.rb
 - lib/datahen/client/scraper_var.rb
 - lib/datahen/error.rb
 - lib/datahen/error/custom_retry_error.rb
@@ -278,7 +281,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://datahen.com
   source_code_uri: https://github.com/DataHenOfficial/datahen-ruby
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -293,8 +296,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.0.3
+signing_key:
 specification_version: 4
 summary: DataHen toolbelt for developers
 test_files: []