datahen 0.10.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
@@ -0,0 +1,18 @@
module Datahen
  module Scraper
    # Routes a finisher script to the executor that handles its file type.
    class Finisher
      # Runs the finisher script at +filename+ with the executor matching its
      # extension. Only +.rb+ scripts have an executor here; for any other
      # extension a notice is printed and nothing is executed.
      #
      # @param filename [String] path of the finisher script.
      # @param job_id [Object, nil] job identifier handed to the executor.
      # @param save [Boolean] forwarded to the executor's +exec_finisher+.
      # @return [Object, nil] whatever the executor returns, or +nil+ when no
      #   executor matches the extension.
      def self.exec_finisher(filename, job_id=nil, save=false)
        extension = File.extname(filename)
        unless extension == '.rb'
          puts "Unable to find a finisher executor for file type \"#{extension}\""
          return
        end

        RubyFinisherExecutor.new(filename: filename, job_id: job_id).exec_finisher(save)
      end
    end
  end
end
@@ -0,0 +1,18 @@
module Datahen
  module Scraper
    # Routes a parser script to the executor that handles its file type.
    class Parser
      # Runs the parser script at +filename+ against the page identified by
      # +gid+. Only +.rb+ scripts have an executor here; for any other
      # extension a notice is printed and nothing is executed.
      #
      # @param filename [String] path of the parser script.
      # @param gid [Object] page identifier handed to the executor.
      # @param job_id [Object, nil] job identifier handed to the executor.
      # @param save [Boolean] forwarded to the executor's +exec_parser+.
      # @param vars [Hash] page variables handed to the executor.
      # @return [Object, nil] whatever the executor returns, or +nil+ when no
      #   executor matches the extension.
      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
        extension = File.extname(filename)
        unless extension == '.rb'
          puts "Unable to find a parser executor for file type \"#{extension}\""
          return
        end

        runner = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
        runner.exec_parser(save)
      end
    end
  end
end
@@ -0,0 +1,116 @@
module Datahen
  module Scraper
    # Executes a Ruby finisher script and, when saving is enabled, reports the
    # finisher status, outputs and failures through +finisher_update+.
    class RubyFinisherExecutor < Executor
      # @return [Boolean] whether statuses are reported to the server; set by
      #   {#exec_finisher}.
      attr_accessor :save

      # @param options [Hash] executor options.
      # @option options [String] :filename path of the finisher script (required).
      # @option options [Object] :job_id job identifier (optional).
      # @raise [RuntimeError] when +:filename+ is not given.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names this executor
      #   declares as exposed (presumably to the evaluated script; the
      #   consumer lives in Executor).
      def self.exposed_methods
        %i[
          outputs
          save_outputs
          find_output
          find_outputs
        ].freeze
      end

      # Records the +save+ flag, announces the run mode and evaluates the
      # finisher script.
      #
      # @param save [Boolean] +true+ to execute and report, +false+ to try
      #   the script only.
      def exec_finisher(save=false)
        @save = save
        puts(save ? "Executing finisher script" : "Trying finisher script")

        eval_finisher_script(save)
      end

      # Evaluates the finisher script, saves the outputs it collected and
      # marks the finisher as done. Any failure is reported through
      # {#handle_error} (only when +save+ is truthy) and then re-raised.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @raise [SyntaxError, StandardError] whatever the script or the save
      #   step raised.
      def eval_finisher_script(save=false)
        update_finisher_starting_status

        outputs = []

        begin
          context = isolated_binding({
            outputs: outputs,
            job_id: job_id
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError descends from ScriptError rather than StandardError,
          # so it must be listed explicitly to be reported as well.
          handle_error(e) if save
          raise e
        end

        puts "=========== Finisher Executed ==========="
        begin
          save_outputs(outputs)
        rescue => e
          handle_error(e) if save
          raise e
        end

        update_finisher_done_status
      end

      # @return [Symbol] the save type tag for this executor, +:executing+.
      def save_type
        :executing
      end

      # Forwards a generic update to +finisher_update+.
      #
      # @param opts [Hash] reads +:job_id+, +:outputs+ and +:status+.
      def update_to_server(opts = {})
        finisher_update(
          job_id: opts[:job_id],
          outputs: opts[:outputs],
          finisher_status: opts[:status])
      end

      # Reports the +:starting+ finisher status. Does nothing unless {#save}
      # is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_finisher_starting_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :starting)

        if response.code == 200
          puts "Finisher Status Updated."
        else
          puts "Error: Unable to save Finisher Status to server: #{response.body}"
          raise "Unable to save Finisher Status to server: #{response.body}"
        end
      end

      # Reports the +:done+ finisher status. Does nothing unless {#save} is
      # set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_finisher_done_status
        return unless save

        response = finisher_update(
          job_id: job_id,
          finisher_status: :done)

        if response.code == 200
          puts "Finisher Done."
        else
          puts "Error: Unable to save Finisher Done Status to server: #{response.body}"
          raise "Unable to save Finisher Done Status to server: #{response.body}"
        end
      end

      # Reports a failed finisher run together with the error description and
      # its cleaned backtrace.
      #
      # @param e [Exception] the error raised while running the finisher.
      def handle_error(e)
        # The job reference is closed with ")" so the logged header is
        # well-formed, matching the parser executor's message format.
        error = ["Finisher #{e.class}: #{e} (Job:#{job_id})", clean_backtrace(e.backtrace)].join("\n")

        finisher_update(
          job_id: job_id,
          finisher_status: :failed,
          log_error: error)
      end
    end
  end
end
@@ -0,0 +1,200 @@
module Datahen
  module Scraper
    # Executes a Ruby parser script against one page and, when saving is
    # enabled, reports the parsing status, pages, outputs and failures
    # through +parsing_update+.
    class RubyParserExecutor < Executor
      # @return [Boolean] whether statuses and page actions are sent to the
      #   server; set by {#exec_parser}.
      attr_accessor :save
      # Refetch self page flag.
      # @return [Boolean]
      # @note Takes precedence over the #reparse_self flag.
      attr_accessor :refetch_self
      # Reparse self page flag.
      # @return [Boolean]
      attr_accessor :reparse_self

      # @param options [Hash] executor options.
      # @option options [String] :filename path of the parser script (required).
      # @option options [Object] :gid identifier of the page to parse (required).
      # @option options [Object, nil] :job_id job identifier; the key must be
      #   present even when the value is +nil+.
      # @option options [Hash] :vars page variables (defaults to an empty hash).
      # @raise [RuntimeError] when +:filename+ or +:gid+ is not given.
      # @raise [KeyError] when the +:job_id+ key is absent.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @gid = options.fetch(:gid) { raise "GID is required"}
        @job_id = options.fetch(:job_id)
        @page_vars = options.fetch(:vars) { {} }
      end

      # @return [Array<Symbol>] frozen list of method names this executor
      #   declares as exposed (presumably to the evaluated script; the
      #   consumer lives in Executor).
      def self.exposed_methods
        %i[
          content
          failed_content
          outputs
          pages
          page
          save_pages
          save_outputs
          find_output
          find_outputs
          refetch
          reparse
        ].freeze
      end

      # Records the +save+ flag, announces the run mode and evaluates the
      # parser script.
      #
      # @param save [Boolean] +true+ to execute and report, +false+ to try
      #   the script only.
      def exec_parser(save=false)
        @save = save
        puts(save ? "Executing parser script" : "Trying parser script")

        eval_parser_script(save)
      end

      # Overrides the page's +'vars'+ entry with the vars given at
      # construction, when there are any.
      #
      # @param page [Hash] page data (mutated in place).
      # @return [Hash] the same +page+ object.
      def init_page_vars(page)
        page['vars'] = @page_vars unless @page_vars.nil? || @page_vars.empty?
        page
      end

      # Forwards a generic update to +parsing_update+.
      #
      # @param opts [Hash] reads +:job_id+, +:gid+, +:pages+, +:outputs+ and
      #   +:status+.
      def update_to_server(opts = {})
        parsing_update(
          job_id: opts[:job_id],
          gid: opts[:gid],
          pages: opts[:pages],
          outputs: opts[:outputs],
          parsing_status: opts[:status])
      end

      # Reports the +:starting+ parsing status. Does nothing unless {#save}
      # is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_parsing_starting_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :starting)

        if response.code == 200
          puts "Page Parsing Status Updated."
        else
          puts "Error: Unable to save Page Parsing Status to server: #{response.body}"
          raise "Unable to save Page Parsing Status to server: #{response.body}"
        end
      end

      # Reports the +:done+ parsing status. Does nothing unless {#save} is
      # set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_parsing_done_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :done)

        if response.code == 200
          puts "Page Parsing Done."
        else
          puts "Error: Unable to save Page Parsing Done Status to server: #{response.body}"
          raise "Unable to save Page Parsing Done Status to server: #{response.body}"
        end
      end

      # @return [Symbol] the save type tag for this executor, +:parsing+.
      def save_type
        :parsing
      end

      # Requests a refetch of the given page for this job when {#save} is
      # set; otherwise only prints what would have been done.
      #
      # @param page_gid [Object] identifier of the page to refetch.
      def refetch_page(page_gid)
        if save
          Client::ScraperJobPage.new({gid: page_gid}).refetch_by_job(self.job_id)
          puts "Refetch page #{page_gid}"
        else
          puts "Would have refetch page #{page_gid}"
        end
      end

      # Refetches a page. When the page is the one being parsed, the request
      # is deferred by raising the {#refetch_self} flag, which
      # {#eval_parser_script} acts on after the script's results are saved.
      #
      # @param page_gid [String] identifier of the page to refetch.
      # @raise [ArgumentError] when +page_gid+ is not a String.
      def refetch(page_gid)
        raise ArgumentError, "page_gid needs to be a String." unless page_gid.is_a?(String)

        if page_gid == gid
          self.refetch_self = true
          return
        end
        refetch_page page_gid
      end

      # Requests a reparse of the given page for this job when {#save} is
      # set; otherwise only prints what would have been done.
      #
      # @param page_gid [Object] identifier of the page to reparse.
      def reparse_page(page_gid)
        if save
          Client::ScraperJobPage.new({gid: page_gid}).reparse_by_job(self.job_id)
          puts "Reparse page #{page_gid}"
        else
          puts "Would have reparse page #{page_gid}"
        end
      end

      # Reparses a page. When the page is the one being parsed, the request
      # is deferred by raising the {#reparse_self} flag, which
      # {#eval_parser_script} acts on after the script's results are saved.
      #
      # @param page_gid [String] identifier of the page to reparse.
      # @raise [ArgumentError] when +page_gid+ is not a String.
      def reparse(page_gid)
        raise ArgumentError, "page_gid needs to be a String." unless page_gid.is_a?(String)

        if page_gid == gid
          self.reparse_self = true
          return
        end
        reparse_page page_gid
      end

      # Evaluates the parser script against the current page, saves the pages
      # and outputs it collected, then either refetches or reparses the
      # current page (when the script asked for it) or marks parsing as done.
      # Any failure is reported through {#handle_error} (only when +save+ is
      # truthy) and then re-raised.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @raise [SyntaxError, StandardError] whatever the script or the save
      #   step raised.
      def eval_parser_script(save=false)
        update_parsing_starting_status

        page = init_page
        outputs = []
        pages = []
        page = init_page_vars(page)
        # Reset the self-action flags so a previous run cannot leak into
        # this one.
        self.refetch_self = false
        self.reparse_self = false

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages,
            page: page
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError descends from ScriptError rather than StandardError,
          # so it must be listed explicitly to be reported as well.
          handle_error(e) if save
          raise e
        end

        puts "=========== Parsing Executed ==========="
        begin
          save_pages_and_outputs(pages, outputs, :parsing)
        rescue => e
          handle_error(e) if save
          raise e
        end

        # A refetch request wins over a reparse request; the done status is
        # only reported when neither was requested for the current page.
        if refetch_self
          refetch_page gid
        elsif reparse_self
          reparse_page gid
        else
          update_parsing_done_status
        end
      end

      # @return [Object] the page content from +get_content+, memoized once
      #   it is truthy.
      def content
        @content ||= get_content(gid)
      end

      # @return [Object] the failed page content from +get_failed_content+,
      #   memoized once it is truthy.
      def failed_content
        @failed_content ||= get_failed_content(gid)
      end

      # Reports a failed parsing run together with the error description and
      # its cleaned backtrace.
      #
      # @param e [Exception] the error raised while parsing.
      def handle_error(e)
        error = ["Parsing #{e.class}: #{e} (Job:#{job_id} GID:#{gid})", clean_backtrace(e.backtrace)].join("\n")

        parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :failed,
          log_error: error)
      end
    end
  end
end
@@ -0,0 +1,120 @@
module Datahen
  module Scraper
    # Executes a Ruby seeder script and, when saving is enabled, reports the
    # seeding status, pages, outputs and failures through +seeding_update+.
    class RubySeederExecutor < Executor
      # @return [Boolean] whether statuses are reported to the server; set by
      #   {#exec_seeder}.
      attr_accessor :save

      # @param options [Hash] executor options.
      # @option options [String] :filename path of the seeder script (required).
      # @option options [Object] :job_id job identifier (optional).
      # @raise [RuntimeError] when +:filename+ is not given.
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names this executor
      #   declares as exposed (presumably to the evaluated script; the
      #   consumer lives in Executor).
      def self.exposed_methods
        %i[
          outputs
          pages
          save_pages
          save_outputs
          find_output
          find_outputs
        ].freeze
      end

      # Records the +save+ flag, announces the run mode and evaluates the
      # seeder script.
      #
      # @param save [Boolean] +true+ to execute and report, +false+ to try
      #   the script only.
      def exec_seeder(save=false)
        @save = save
        puts(save ? "Executing seeder script" : "Trying seeder script")

        eval_seeder_script(save)
      end

      # Evaluates the seeder script, saves the pages and outputs it collected
      # and marks seeding as done. Any failure is reported through
      # {#handle_error} (only when +save+ is truthy) and then re-raised.
      #
      # @param save [Boolean] whether failures are reported to the server.
      # @raise [SyntaxError, StandardError] whatever the script or the save
      #   step raised.
      def eval_seeder_script(save=false)
        update_seeding_starting_status

        outputs = []
        pages = []

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError descends from ScriptError rather than StandardError,
          # so it must be listed explicitly to be reported as well.
          handle_error(e) if save
          raise e
        end

        puts "=========== Seeding Executed ==========="
        begin
          save_pages_and_outputs(pages, outputs, :seeding)
        rescue => e
          handle_error(e) if save
          raise e
        end

        update_seeding_done_status
      end

      # @return [Symbol] the save type tag for this executor, +:seeding+.
      def save_type
        :seeding
      end

      # Forwards a generic update to +seeding_update+.
      #
      # @param opts [Hash] reads +:job_id+, +:pages+, +:outputs+ and +:status+.
      def update_to_server(opts = {})
        seeding_update(
          job_id: opts[:job_id],
          pages: opts[:pages],
          outputs: opts[:outputs],
          seeding_status: opts[:status])
      end

      # Reports the +:starting+ seeding status. Does nothing unless {#save}
      # is set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_seeding_starting_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :starting)

        if response.code == 200
          puts "Seeding Status Updated."
        else
          puts "Error: Unable to save Seeding Status to server: #{response.body}"
          raise "Unable to save Seeding Status to server: #{response.body}"
        end
      end

      # Reports the +:done+ seeding status. Does nothing unless {#save} is
      # set.
      #
      # @raise [RuntimeError] when the response code is not 200.
      def update_seeding_done_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :done)

        if response.code == 200
          puts "Seeding Done."
        else
          puts "Error: Unable to save Seeding Done Status to server: #{response.body}"
          raise "Unable to save Seeding Done Status to server: #{response.body}"
        end
      end

      # Reports a failed seeding run together with the error description and
      # its cleaned backtrace.
      #
      # @param e [Exception] the error raised while seeding.
      def handle_error(e)
        # The job reference is closed with ")" so the logged header is
        # well-formed, matching the parser executor's message format.
        error = ["Seeding #{e.class}: #{e} (Job:#{job_id})", clean_backtrace(e.backtrace)].join("\n")

        seeding_update(
          job_id: job_id,
          seeding_status: :failed,
          log_error: error)
      end
    end
  end
end