datahen 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
@@ -0,0 +1,18 @@
module Datahen
  module Scraper
    # Chooses an executor for a finisher script based on the script's
    # file extension and runs it.
    class Finisher
      # Runs the finisher script located at +filename+.
      #
      # Only +.rb+ scripts are handled; they are passed to
      # RubyFinisherExecutor. For every other extension a message is
      # printed to standard output and nothing is executed.
      #
      # @param filename [String] path of the finisher script.
      # @param job_id [Object, nil] job identifier forwarded to the executor.
      # @param save [Boolean] forwarded to the executor's +exec_finisher+.
      # @return [Object, nil] whatever the executor returns, or nil when the
      #   file type is not supported.
      def self.exec_finisher(filename, job_id=nil, save=false)
        extension = File.extname(filename)
        unless extension == '.rb'
          puts "Unable to find a finisher executor for file type \"#{extension}\""
          return
        end

        RubyFinisherExecutor.new(filename: filename, job_id: job_id).exec_finisher(save)
      end
    end
  end
end
@@ -0,0 +1,18 @@
module Datahen
  module Scraper
    # Chooses an executor for a parser script based on the script's
    # file extension and runs it against a page.
    class Parser
      # Runs the parser script located at +filename+ for the page +gid+.
      #
      # Only +.rb+ scripts are handled; they are passed to
      # RubyParserExecutor. For every other extension a message is printed
      # to standard output and nothing is executed.
      #
      # @param filename [String] path of the parser script.
      # @param gid [Object] page identifier forwarded to the executor.
      # @param job_id [Object, nil] job identifier forwarded to the executor.
      # @param save [Boolean] forwarded to the executor's +exec_parser+.
      # @param vars [Hash] page variables forwarded to the executor.
      # @return [Object, nil] whatever the executor returns, or nil when the
      #   file type is not supported.
      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
        extension = File.extname(filename)
        unless extension == '.rb'
          puts "Unable to find a parser executor for file type \"#{extension}\""
          return
        end

        runner = RubyParserExecutor.new(
          filename: filename,
          gid: gid,
          job_id: job_id,
          vars: vars
        )
        runner.exec_parser(save)
      end
    end
  end
end
@@ -0,0 +1,116 @@
module Datahen
  module Scraper
    # Evaluates a Ruby finisher script for a job and reports the finisher
    # status back through +finisher_update+.
    #
    # NOTE(review): +filename+, +job_id+, +isolated_binding+,
    # +eval_with_context+, +save_outputs+, +finisher_update+ and
    # +clean_backtrace+ are not defined in this class; presumably they are
    # inherited from Executor -- confirm there.
    class RubyFinisherExecutor < Executor
      # Flag assigned by #exec_finisher. The status update methods return
      # early when it is falsy.
      # @return [Boolean]
      attr_accessor :save

      # @param options [Hash]
      # @option options [String] :filename finisher script path (required).
      # @option options [Object, nil] :job_id job identifier (optional).
      # @raise [RuntimeError] when +:filename+ is not given.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required" }
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names.
      #   NOTE(review): presumably the methods made callable from the
      #   evaluated script -- confirm against Executor.
      def self.exposed_methods
        [
          :outputs,
          :save_outputs,
          :find_output,
          :find_outputs
        ].freeze
      end

      # Stores the +save+ flag, announces the mode on standard output and
      # evaluates the finisher script.
      #
      # @param save [Boolean] true to execute and report to the server,
      #   false for a trial run.
      # @return [Object, nil] result of #eval_finisher_script.
      def exec_finisher(save=false)
        @save = save
        if save
          puts "Executing finisher script"
        else
          puts "Trying finisher script"
        end

        eval_finisher_script(save)
      end

      # Evaluates the finisher script and then saves the collected outputs.
      #
      # Any error raised while evaluating the script or while saving the
      # outputs is reported through #handle_error (only when +save+ is
      # truthy) and then re-raised unchanged.
      #
      # Note that the "starting"/"done" status updates consult the +save+
      # attribute, while error reporting consults the +save+ argument.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @return [nil]
      def eval_finisher_script(save=false)
        update_finisher_starting_status

        outputs = []

        begin
          context = isolated_binding({
            outputs: outputs,
            job_id: job_id
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is not a StandardError, so it must be listed
          # explicitly to be reported as well.
          handle_error(e) if save
          raise
        end

        puts "=========== Finisher Executed ==========="
        begin
          save_outputs(outputs)
        rescue => e
          handle_error(e) if save
          raise
        end

        update_finisher_done_status
      end

      # @return [Symbol] always +:executing+.
      def save_type
        :executing
      end

      # Forwards the +:job_id+, +:outputs+ and +:status+ entries of +opts+
      # to +finisher_update+.
      #
      # @param opts [Hash]
      # @return [Object] the value returned by +finisher_update+.
      def update_to_server(opts = {})
        finisher_update(
          job_id: opts[:job_id],
          outputs: opts[:outputs],
          finisher_status: opts[:status])
      end

      # Reports the +:starting+ finisher status. Does nothing unless the
      # +save+ attribute is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_finisher_starting_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :starting)

        if response.code == 200
          puts "Finisher Status Updated."
        else
          puts "Error: Unable to save Finisher Status to server: #{response.body}"
          raise "Unable to save Finisher Status to server: #{response.body}"
        end
      end

      # Reports the +:done+ finisher status. Does nothing unless the +save+
      # attribute is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_finisher_done_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :done)

        if response.code == 200
          puts "Finisher Done."
        else
          puts "Error: Unable to save Finisher Done Status to server: #{response.body}"
          raise "Unable to save Finisher Done Status to server: #{response.body}"
        end
      end

      # Reports a failure: sends the +:failed+ status together with an error
      # log made of a summary line and the cleaned backtrace.
      #
      # @param e [Exception] the error to report.
      # @return [Object] the value returned by +finisher_update+.
      def handle_error(e)
        error = ["Finisher #{e.class}: #{e} (Job:#{job_id})", clean_backtrace(e.backtrace)].join("\n")

        finisher_update(
          job_id: job_id,
          finisher_status: :failed,
          log_error: error)
      end
    end
  end
end
@@ -0,0 +1,200 @@
module Datahen
  module Scraper
    # Evaluates a Ruby parser script for a single page and reports the
    # parsing status back through +parsing_update+.
    #
    # NOTE(review): +filename+, +gid+, +job_id+, +init_page+,
    # +isolated_binding+, +eval_with_context+, +save_pages_and_outputs+,
    # +parsing_update+, +get_content+, +get_failed_content+ and
    # +clean_backtrace+ are not defined in this class; presumably they are
    # inherited from Executor -- confirm there.
    class RubyParserExecutor < Executor
      # Flag assigned by #exec_parser. The status update, refetch and
      # reparse methods consult it.
      # @return [Boolean]
      attr_accessor :save
      # Refetch self page flag.
      # @return [Boolean]
      # @note It is stronger than #reparse_self flag.
      attr_accessor :refetch_self
      # Reparse self page flag.
      # @return [Boolean]
      attr_accessor :reparse_self

      # @param options [Hash]
      # @option options [String] :filename parser script path (required).
      # @option options [Object] :gid page identifier (required).
      # @option options [Object, nil] :job_id job identifier (optional, as
      #   in the finisher and seeder executors).
      # @option options [Hash] :vars page variables, defaults to +{}+.
      # @raise [RuntimeError] when +:filename+ or +:gid+ is not given.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required" }
        @gid = options.fetch(:gid) { raise "GID is required" }
        @job_id = options[:job_id]
        @page_vars = options.fetch(:vars) { {} }
      end

      # @return [Array<Symbol>] frozen list of method names.
      #   NOTE(review): presumably the methods made callable from the
      #   evaluated script -- confirm against Executor.
      def self.exposed_methods
        [
          :content,
          :failed_content,
          :outputs,
          :pages,
          :page,
          :save_pages,
          :save_outputs,
          :find_output,
          :find_outputs,
          :refetch,
          :reparse
        ].freeze
      end

      # Stores the +save+ flag, announces the mode on standard output and
      # evaluates the parser script.
      #
      # @param save [Boolean] true to execute and report to the server,
      #   false for a trial run.
      # @return [Object, nil] result of #eval_parser_script.
      def exec_parser(save=false)
        @save = save
        if save
          puts "Executing parser script"
        else
          puts "Trying parser script"
        end

        eval_parser_script(save)
      end

      # Overrides the page's +'vars'+ entry with the vars given at
      # construction time, but only when those vars are neither nil nor
      # empty.
      #
      # @param page [Hash] page data; modified in place.
      # @return [Hash] the same +page+ object.
      def init_page_vars(page)
        page['vars'] = @page_vars unless @page_vars.nil? || @page_vars.empty?
        page
      end

      # Forwards the +:job_id+, +:gid+, +:pages+, +:outputs+ and +:status+
      # entries of +opts+ to +parsing_update+.
      #
      # @param opts [Hash]
      # @return [Object] the value returned by +parsing_update+.
      def update_to_server(opts = {})
        parsing_update(
          job_id: opts[:job_id],
          gid: opts[:gid],
          pages: opts[:pages],
          outputs: opts[:outputs],
          parsing_status: opts[:status])
      end

      # Reports the +:starting+ parsing status. Does nothing unless the
      # +save+ attribute is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_parsing_starting_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :starting)

        if response.code == 200
          puts "Page Parsing Status Updated."
        else
          puts "Error: Unable to save Page Parsing Status to server: #{response.body}"
          raise "Unable to save Page Parsing Status to server: #{response.body}"
        end
      end

      # Reports the +:done+ parsing status. Does nothing unless the +save+
      # attribute is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_parsing_done_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :done)

        if response.code == 200
          puts "Page Parsing Done."
        else
          puts "Error: Unable to save Page Parsing Done Status to server: #{response.body}"
          raise "Unable to save Page Parsing Done Status to server: #{response.body}"
        end
      end

      # @return [Symbol] always +:parsing+.
      def save_type
        :parsing
      end

      # Requests a refetch of the page +gid+ for this job when the +save+
      # attribute is truthy; otherwise only prints what would have happened.
      #
      # @param gid [Object] identifier of the page to refetch.
      # @return [nil]
      def refetch_page(gid)
        if save
          Client::ScraperJobPage.new({gid: gid}).refetch_by_job(self.job_id)
          puts "Refetch page #{gid}"
        else
          puts "Would have refetch page #{gid}"
        end
      end

      # Schedules a refetch. When +page_gid+ is the page being parsed, only
      # the #refetch_self flag is raised; other pages are refetched right
      # away through #refetch_page.
      #
      # @param page_gid [String] identifier of the page to refetch.
      # @raise [ArgumentError] when +page_gid+ is not a String.
      # @return [nil]
      def refetch(page_gid)
        raise ArgumentError, "page_gid needs to be a String." unless page_gid.is_a?(String)
        if page_gid == gid
          self.refetch_self = true
          return
        end
        refetch_page page_gid
      end

      # Requests a reparse of the page +gid+ for this job when the +save+
      # attribute is truthy; otherwise only prints what would have happened.
      #
      # @param gid [Object] identifier of the page to reparse.
      # @return [nil]
      def reparse_page(gid)
        if save
          Client::ScraperJobPage.new({gid: gid}).reparse_by_job(self.job_id)
          puts "Reparse page #{gid}"
        else
          puts "Would have reparse page #{gid}"
        end
      end

      # Schedules a reparse. When +page_gid+ is the page being parsed, only
      # the #reparse_self flag is raised; other pages are reparsed right
      # away through #reparse_page.
      #
      # @param page_gid [String] identifier of the page to reparse.
      # @raise [ArgumentError] when +page_gid+ is not a String.
      # @return [nil]
      def reparse(page_gid)
        raise ArgumentError, "page_gid needs to be a String." unless page_gid.is_a?(String)
        if page_gid == gid
          self.reparse_self = true
          return
        end
        reparse_page page_gid
      end

      # Evaluates the parser script, saves the collected pages and outputs,
      # and finally either refetches/reparses the current page or marks
      # parsing as done.
      #
      # Any error raised while evaluating the script or while saving is
      # reported through #handle_error (only when +save+ is truthy) and
      # then re-raised unchanged.
      #
      # Note that the status updates and refetch/reparse calls consult the
      # +save+ attribute, while error reporting consults the +save+
      # argument.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @return [nil]
      def eval_parser_script(save=false)
        update_parsing_starting_status

        page = init_page_vars(init_page)
        outputs = []
        pages = []
        self.refetch_self = false
        self.reparse_self = false

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages,
            page: page
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is not a StandardError, so it must be listed
          # explicitly to be reported as well.
          handle_error(e) if save
          raise
        end

        puts "=========== Parsing Executed ==========="
        begin
          save_pages_and_outputs(pages, outputs, :parsing)
        rescue => e
          handle_error(e) if save
          raise
        end

        # Refetch takes precedence over reparse; the done status is only
        # reported when neither was requested for the current page.
        if refetch_self
          refetch_page gid
        elsif reparse_self
          reparse_page gid
        else
          update_parsing_done_status
        end
      end

      # @return [Object] result of +get_content+ for the current page,
      #   memoized after the first truthy result.
      def content
        @content ||= get_content(gid)
      end

      # @return [Object] result of +get_failed_content+ for the current
      #   page, memoized after the first truthy result.
      def failed_content
        @failed_content ||= get_failed_content(gid)
      end

      # Reports a failure: sends the +:failed+ status together with an error
      # log made of a summary line and the cleaned backtrace.
      #
      # @param e [Exception] the error to report.
      # @return [Object] the value returned by +parsing_update+.
      def handle_error(e)
        error = ["Parsing #{e.class}: #{e} (Job:#{job_id} GID:#{gid})", clean_backtrace(e.backtrace)].join("\n")

        parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :failed,
          log_error: error)
      end
    end
  end
end
@@ -0,0 +1,120 @@
module Datahen
  module Scraper
    # Evaluates a Ruby seeder script for a job and reports the seeding
    # status back through +seeding_update+.
    #
    # NOTE(review): +filename+, +job_id+, +isolated_binding+,
    # +eval_with_context+, +save_pages_and_outputs+, +seeding_update+ and
    # +clean_backtrace+ are not defined in this class; presumably they are
    # inherited from Executor -- confirm there.
    class RubySeederExecutor < Executor
      # Flag assigned by #exec_seeder. The status update methods return
      # early when it is falsy.
      # @return [Boolean]
      attr_accessor :save

      # @param options [Hash]
      # @option options [String] :filename seeder script path (required).
      # @option options [Object, nil] :job_id job identifier (optional).
      # @raise [RuntimeError] when +:filename+ is not given.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required" }
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names.
      #   NOTE(review): presumably the methods made callable from the
      #   evaluated script -- confirm against Executor.
      def self.exposed_methods
        [
          :outputs,
          :pages,
          :save_pages,
          :save_outputs,
          :find_output,
          :find_outputs
        ].freeze
      end

      # Stores the +save+ flag, announces the mode on standard output and
      # evaluates the seeder script.
      #
      # @param save [Boolean] true to execute and report to the server,
      #   false for a trial run.
      # @return [Object, nil] result of #eval_seeder_script.
      def exec_seeder(save=false)
        @save = save
        if save
          puts "Executing seeder script"
        else
          puts "Trying seeder script"
        end

        eval_seeder_script(save)
      end

      # Evaluates the seeder script and then saves the collected pages and
      # outputs.
      #
      # Any error raised while evaluating the script or while saving is
      # reported through #handle_error (only when +save+ is truthy) and
      # then re-raised unchanged.
      #
      # Note that the "starting"/"done" status updates consult the +save+
      # attribute, while error reporting consults the +save+ argument.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @return [nil]
      def eval_seeder_script(save=false)
        update_seeding_starting_status

        outputs = []
        pages = []

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is not a StandardError, so it must be listed
          # explicitly to be reported as well.
          handle_error(e) if save
          raise
        end

        puts "=========== Seeding Executed ==========="
        begin
          save_pages_and_outputs(pages, outputs, :seeding)
        rescue => e
          handle_error(e) if save
          raise
        end

        update_seeding_done_status
      end

      # @return [Symbol] always +:seeding+.
      def save_type
        :seeding
      end

      # Forwards the +:job_id+, +:pages+, +:outputs+ and +:status+ entries
      # of +opts+ to +seeding_update+.
      #
      # @param opts [Hash]
      # @return [Object] the value returned by +seeding_update+.
      def update_to_server(opts = {})
        seeding_update(
          job_id: opts[:job_id],
          pages: opts[:pages],
          outputs: opts[:outputs],
          seeding_status: opts[:status])
      end

      # Reports the +:starting+ seeding status. Does nothing unless the
      # +save+ attribute is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_seeding_starting_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :starting)

        if response.code == 200
          puts "Seeding Status Updated."
        else
          puts "Error: Unable to save Seeding Status to server: #{response.body}"
          raise "Unable to save Seeding Status to server: #{response.body}"
        end
      end

      # Reports the +:done+ seeding status. Does nothing unless the +save+
      # attribute is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200.
      # @return [nil]
      def update_seeding_done_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :done)

        if response.code == 200
          puts "Seeding Done."
        else
          puts "Error: Unable to save Seeding Done Status to server: #{response.body}"
          raise "Unable to save Seeding Done Status to server: #{response.body}"
        end
      end

      # Reports a failure: sends the +:failed+ status together with an error
      # log made of a summary line and the cleaned backtrace.
      #
      # @param e [Exception] the error to report.
      # @return [Object] the value returned by +seeding_update+.
      def handle_error(e)
        error = ["Seeding #{e.class}: #{e} (Job:#{job_id})", clean_backtrace(e.backtrace)].join("\n")

        seeding_update(
          job_id: job_id,
          seeding_status: :failed,
          log_error: error)
      end
    end
  end
end