answersengine 0.2.33

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +30 -0
  8. data/Rakefile +22 -0
  9. data/answersengine.gemspec +45 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/answersengine +3 -0
  25. data/lib/answersengine.rb +5 -0
  26. data/lib/answersengine/cli.rb +33 -0
  27. data/lib/answersengine/cli/global_page.rb +39 -0
  28. data/lib/answersengine/cli/job.rb +30 -0
  29. data/lib/answersengine/cli/job_output.rb +69 -0
  30. data/lib/answersengine/cli/parser.rb +64 -0
  31. data/lib/answersengine/cli/scraper.rb +172 -0
  32. data/lib/answersengine/cli/scraper_deployment.rb +24 -0
  33. data/lib/answersengine/cli/scraper_export.rb +51 -0
  34. data/lib/answersengine/cli/scraper_exporter.rb +40 -0
  35. data/lib/answersengine/cli/scraper_job.rb +71 -0
  36. data/lib/answersengine/cli/scraper_page.rb +200 -0
  37. data/lib/answersengine/cli/seeder.rb +40 -0
  38. data/lib/answersengine/client.rb +23 -0
  39. data/lib/answersengine/client/backblaze_content.rb +45 -0
  40. data/lib/answersengine/client/base.rb +50 -0
  41. data/lib/answersengine/client/export.rb +10 -0
  42. data/lib/answersengine/client/global_page.rb +18 -0
  43. data/lib/answersengine/client/job.rb +53 -0
  44. data/lib/answersengine/client/job_export.rb +10 -0
  45. data/lib/answersengine/client/job_log.rb +27 -0
  46. data/lib/answersengine/client/job_output.rb +19 -0
  47. data/lib/answersengine/client/job_page.rb +62 -0
  48. data/lib/answersengine/client/job_stat.rb +16 -0
  49. data/lib/answersengine/client/scraper.rb +54 -0
  50. data/lib/answersengine/client/scraper_deployment.rb +17 -0
  51. data/lib/answersengine/client/scraper_export.rb +22 -0
  52. data/lib/answersengine/client/scraper_exporter.rb +14 -0
  53. data/lib/answersengine/client/scraper_job.rb +49 -0
  54. data/lib/answersengine/client/scraper_job_output.rb +19 -0
  55. data/lib/answersengine/client/scraper_job_page.rb +55 -0
  56. data/lib/answersengine/plugin.rb +6 -0
  57. data/lib/answersengine/plugin/context_exposer.rb +55 -0
  58. data/lib/answersengine/scraper.rb +16 -0
  59. data/lib/answersengine/scraper/executor.rb +292 -0
  60. data/lib/answersengine/scraper/parser.rb +18 -0
  61. data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
  62. data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
  63. data/lib/answersengine/scraper/seeder.rb +18 -0
  64. data/lib/answersengine/version.rb +3 -0
  65. metadata +255 -0
@@ -0,0 +1,18 @@
module AnswersEngine
  module Scraper
    # Dispatches a parser script to the executor that matches its file type.
    class Parser
      # Runs the parser script +filename+ for the page identified by +gid+.
      #
      # Only scripts whose extension is exactly ".rb" are supported; they are
      # handed to RubyParserExecutor. For any other extension a message is
      # printed to stdout and nil is returned.
      #
      # @param filename path of the parser script; its extension picks the executor
      # @param gid identifier of the page, forwarded to the executor
      # @param job_id job identifier, forwarded to the executor (may be nil)
      # @param save forwarded to RubyParserExecutor#exec_parser
      # @param vars forwarded to the executor as its :vars option
      # @return whatever RubyParserExecutor#exec_parser returns, or nil when
      #   the file type is not supported
      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
        file_type = File.extname(filename)

        unless file_type == '.rb'
          puts "Unable to find a parser executor for file type \"#{file_type}\""
          return
        end

        runner = RubyParserExecutor.new(filename: filename, gid: gid, job_id: job_id, vars: vars)
        runner.exec_parser(save)
      end
    end
  end
end
@@ -0,0 +1,141 @@
module AnswersEngine
  module Scraper
    # Executes a Ruby parser script for one page (identified by job id and
    # GID) and, when saving is enabled, reports parsing status, pages and
    # outputs through +parsing_update+.
    #
    # Helpers such as +parsing_update+, +init_page+, +isolated_binding+,
    # +eval_with_context+, +save_pages_and_outputs+, +get_content+,
    # +get_failed_content+ and +clean_backtrace+, as well as the +filename+,
    # +gid+ and +job_id+ readers, are not defined in this class.
    # NOTE(review): presumably they are inherited from Executor - confirm.
    class RubyParserExecutor < Executor
      # Truthy when status changes should be sent to the server.
      attr_accessor :save

      # @param options [Hash] construction options
      #   :filename - parser script path (required)
      #   :gid      - page GID (required)
      #   :job_id   - job identifier; the key must be present (its value may
      #               be nil)
      #   :vars     - optional page vars, defaults to an empty Hash
      # @raise [RuntimeError] when :filename or :gid is missing
      # @raise [KeyError] when :job_id is missing
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @gid = options.fetch(:gid) { raise "GID is required"}
        @job_id = options.fetch(:job_id)
        @page_vars = options.fetch(:vars) { {} }
      end

      # @return [Array<Symbol>] frozen list of method names this executor
      #   exposes (how the list is consumed is defined outside this class)
      def self.exposed_methods
        [
          :content,
          :failed_content,
          :outputs,
          :pages,
          :page,
          :save_pages,
          :save_outputs,
          :find_output,
          :find_outputs
        ].freeze
      end

      # Records the save flag, announces the mode on stdout and evaluates the
      # parser script.
      #
      # @param save [Boolean] true to execute and save, false for a trial run
      # @return the result of #eval_parser_script
      def exec_parser(save=false)
        @save = save
        puts(save ? "Executing parser script" : "Trying parser script")

        eval_parser_script(save)
      end

      # Attaches the vars given at construction to +page+ under the 'vars'
      # key, unless none were given.
      #
      # @param page the page object to decorate (must respond to []=)
      # @return the same +page+ object
      def init_page_vars(page)
        page['vars'] = @page_vars unless @page_vars.nil? || @page_vars.empty?
        page
      end

      # Forwards pages, outputs and status to +parsing_update+.
      #
      # @param opts [Hash] :job_id, :gid, :pages, :outputs and :status
      # @return the result of +parsing_update+
      def update_to_server(opts = {})
        parsing_update(
          job_id: opts[:job_id],
          gid: opts[:gid],
          pages: opts[:pages],
          outputs: opts[:outputs],
          parsing_status: opts[:status])
      end

      # Marks the page as :starting on the server; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the server does not answer with code 200
      def update_parsing_starting_status
        report_parsing_status(
          :starting,
          "Page Parsing Status Updated.",
          "Unable to save Page Parsing Status to server")
      end

      # Marks the page as :done on the server; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the server does not answer with code 200
      def update_parsing_done_status
        report_parsing_status(
          :done,
          "Page Parsing Done.",
          "Unable to save Page Parsing Done Status to server")
      end

      # @return [Symbol] always :parsing
      def save_type
        :parsing
      end

      # Evaluates the parser script with +outputs+, +pages+ and +page+ exposed
      # to it, then saves the collected pages and outputs.
      #
      # Errors raised by the script are always re-raised. They are reported
      # through #handle_error first only when the +save+ argument is truthy;
      # the status updates, by contrast, are gated on the +save+ accessor that
      # #exec_parser sets.
      #
      # @param save [Boolean] whether script errors are reported to the server
      # @return the result of #update_parsing_done_status
      def eval_parser_script(save=false)
        update_parsing_starting_status

        page = init_page_vars(init_page)
        outputs = []
        pages = []

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages,
            page: page
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is not a StandardError, so it has to be named
          # explicitly for broken scripts to be reported as well.
          handle_error(e) if save
          raise
        end

        puts "=========== Parsing Executed ==========="
        save_pages_and_outputs(pages, outputs, :parsing)
        update_parsing_done_status
      end

      # @return the page content, fetched once via +get_content+ and memoized
      def content
        @content ||= get_content(gid)
      end

      # @return the failed page content, fetched once via
      #   +get_failed_content+ and memoized
      def failed_content
        @failed_content ||= get_failed_content(gid)
      end

      # Sends a :failed parsing status together with an error log made of the
      # exception class, message, job/GID and the cleaned backtrace.
      #
      # @param e [Exception] the error raised while evaluating the script
      # @return the result of +parsing_update+
      def handle_error(e)
        error = ["Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})",clean_backtrace(e.backtrace)].join("\n")

        parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :failed,
          log_error: error)
      end

      private

      # Pushes +status+ for the current page and prints +success_message+ on a
      # 200 response; otherwise prints and raises +failure_message+ followed
      # by the response body. Does nothing unless +save+ is set.
      def report_parsing_status(status, success_message, failure_message)
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: status)

        if response.code == 200
          puts success_message
        else
          failure = "#{failure_message}: #{response.body}"
          puts "Error: #{failure}"
          raise failure
        end
      end
    end
  end
end
@@ -0,0 +1,114 @@
module AnswersEngine
  module Scraper
    # Executes a Ruby seeder script for a job and, when saving is enabled,
    # reports seeding status, pages and outputs through +seeding_update+.
    #
    # Helpers such as +seeding_update+, +isolated_binding+,
    # +eval_with_context+, +save_pages_and_outputs+ and +clean_backtrace+, as
    # well as the +filename+ and +job_id+ readers, are not defined in this
    # class.
    # NOTE(review): presumably they are inherited from Executor - confirm.
    class RubySeederExecutor < Executor
      # Truthy when status changes should be sent to the server.
      attr_accessor :save

      # @param options [Hash] construction options
      #   :filename - seeder script path (required)
      #   :job_id   - job identifier (optional, nil when absent)
      # @raise [RuntimeError] when :filename is missing
      def initialize(options={})
        @filename = options.fetch(:filename) { raise "Filename is required"}
        @job_id = options[:job_id]
      end

      # @return [Array<Symbol>] frozen list of method names this executor
      #   exposes (how the list is consumed is defined outside this class)
      def self.exposed_methods
        [
          :outputs,
          :pages,
          :save_pages,
          :save_outputs,
          :find_output,
          :find_outputs
        ].freeze
      end

      # Records the save flag, announces the mode on stdout and evaluates the
      # seeder script.
      #
      # @param save [Boolean] true to execute and save, false for a trial run
      # @return the result of #eval_seeder_script
      def exec_seeder(save=false)
        @save = save
        puts(save ? "Executing seeder script" : "Trying seeder script")

        eval_seeder_script(save)
      end

      # Evaluates the seeder script with +outputs+ and +pages+ exposed to it,
      # then saves the collected pages and outputs.
      #
      # Errors raised by the script are always re-raised. They are reported
      # through #handle_error first only when the +save+ argument is truthy;
      # the status updates, by contrast, are gated on the +save+ accessor that
      # #exec_seeder sets.
      #
      # @param save [Boolean] whether script errors are reported to the server
      # @return the result of #update_seeding_done_status
      def eval_seeder_script(save=false)
        update_seeding_starting_status

        outputs = []
        pages = []

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is not a StandardError, so it has to be named
          # explicitly for broken scripts to be reported as well.
          handle_error(e) if save
          raise
        end

        puts "=========== Seeding Executed ==========="
        save_pages_and_outputs(pages, outputs, :seeding)
        update_seeding_done_status
      end

      # @return [Symbol] always :seeding
      def save_type
        :seeding
      end

      # Forwards pages, outputs and status to +seeding_update+.
      #
      # @param opts [Hash] :job_id, :pages, :outputs and :status
      # @return the result of +seeding_update+
      def update_to_server(opts = {})
        seeding_update(
          job_id: opts[:job_id],
          pages: opts[:pages],
          outputs: opts[:outputs],
          seeding_status: opts[:status])
      end

      # Marks the job as :starting on the server; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the server does not answer with code 200
      def update_seeding_starting_status
        report_seeding_status(
          :starting,
          "Seeding Status Updated.",
          "Unable to save Seeding Status to server")
      end

      # Marks the job as :done on the server; no-op unless +save+ is set.
      #
      # @raise [RuntimeError] when the server does not answer with code 200
      def update_seeding_done_status
        report_seeding_status(
          :done,
          "Seeding Done.",
          "Unable to save Seeding Done Status to server")
      end

      # Sends a :failed seeding status together with an error log made of the
      # exception class, message, job id and the cleaned backtrace.
      #
      # @param e [Exception] the error raised while evaluating the script
      # @return the result of +seeding_update+
      def handle_error(e)
        # The closing parenthesis after the job id was missing in the
        # original message, leaving the log line unbalanced.
        error = ["Seeding #{e.class}: #{e.to_s} (Job:#{job_id})",clean_backtrace(e.backtrace)].join("\n")

        seeding_update(
          job_id: job_id,
          seeding_status: :failed,
          log_error: error)
      end

      private

      # Pushes +status+ for the current job and prints +success_message+ on a
      # 200 response; otherwise prints and raises +failure_message+ followed
      # by the response body. Does nothing unless +save+ is set.
      def report_seeding_status(status, success_message, failure_message)
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: status)

        if response.code == 200
          puts success_message
        else
          failure = "#{failure_message}: #{response.body}"
          puts "Error: #{failure}"
          raise failure
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
module AnswersEngine
  module Scraper
    # Dispatches a seeder script to the executor that matches its file type.
    class Seeder
      # Runs the seeder script +filename+.
      #
      # Only scripts whose extension is exactly ".rb" are supported; they are
      # handed to RubySeederExecutor. For any other extension a message is
      # printed to stdout and nil is returned.
      #
      # @param filename path of the seeder script; its extension picks the executor
      # @param job_id job identifier, forwarded to the executor (may be nil)
      # @param save forwarded to RubySeederExecutor#exec_seeder
      # @return whatever RubySeederExecutor#exec_seeder returns, or nil when
      #   the file type is not supported
      def self.exec_seeder(filename, job_id=nil, save=false)
        file_type = File.extname(filename)

        unless file_type == '.rb'
          puts "Unable to find a seeder executor for file type \"#{file_type}\""
          return
        end

        runner = RubySeederExecutor.new(filename: filename, job_id: job_id)
        runner.exec_seeder(save)
      end
    end
  end
end
@@ -0,0 +1,3 @@
module AnswersEngine
  # Version string of the answersengine gem. Frozen so the shared constant
  # cannot be mutated in place by accident.
  VERSION = "0.2.33".freeze
end
metadata ADDED
@@ -0,0 +1,255 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: answersengine
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.33
5
+ platform: ruby
6
+ authors:
7
+ - Parama Danoesubroto
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-03-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.20.3
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.20.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: httparty
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.16.2
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.16.2
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ - - "<"
49
+ - !ruby/object:Gem::Version
50
+ version: '1.10'
51
+ type: :runtime
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '1.6'
58
+ - - "<"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.10'
61
+ - !ruby/object:Gem::Dependency
62
+ name: bundler
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '1.16'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '1.16'
75
+ - !ruby/object:Gem::Dependency
76
+ name: rake
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '10.0'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '10.0'
89
+ - !ruby/object:Gem::Dependency
90
+ name: minitest
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '5.11'
96
+ type: :development
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '5.11'
103
+ - !ruby/object:Gem::Dependency
104
+ name: simplecov
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 0.16.1
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: 0.16.1
117
+ - !ruby/object:Gem::Dependency
118
+ name: simplecov-console
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: 0.4.2
124
+ type: :development
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: 0.4.2
131
+ - !ruby/object:Gem::Dependency
132
+ name: timecop
133
+ requirement: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: 0.9.1
138
+ type: :development
139
+ prerelease: false
140
+ version_requirements: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: 0.9.1
145
+ - !ruby/object:Gem::Dependency
146
+ name: byebug
147
+ requirement: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ type: :development
153
+ prerelease: false
154
+ version_requirements: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - ">="
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ description: AnswersEngine toolbelt to develop scrapers and other scripts
160
+ email:
161
+ - parama@answersengine.com
162
+ executables:
163
+ - answersengine
164
+ extensions: []
165
+ extra_rdoc_files: []
166
+ files:
167
+ - ".gitignore"
168
+ - ".travis.yml"
169
+ - CODE_OF_CONDUCT.md
170
+ - Gemfile
171
+ - LICENSE.txt
172
+ - README.md
173
+ - Rakefile
174
+ - answersengine.gemspec
175
+ - bin/console
176
+ - bin/setup
177
+ - examples/fetchtest/libraries/hello.rb
178
+ - examples/fetchtest/libraries/hello_fail.rb
179
+ - examples/fetchtest/parsers/failed.rb
180
+ - examples/fetchtest/parsers/find_outputs.rb
181
+ - examples/fetchtest/parsers/home.rb
182
+ - examples/fetchtest/parsers/nested_fail.rb
183
+ - examples/fetchtest/parsers/simple.rb
184
+ - examples/fetchtest/seeders/csv_seeder.rb
185
+ - examples/fetchtest/seeders/failed.rb
186
+ - examples/fetchtest/seeders/list_of_urls.csv
187
+ - examples/fetchtest/seeders/seed.rb
188
+ - examples/fetchtest/seeders/test_reset_page.rb
189
+ - exe/answersengine
190
+ - lib/answersengine.rb
191
+ - lib/answersengine/cli.rb
192
+ - lib/answersengine/cli/global_page.rb
193
+ - lib/answersengine/cli/job.rb
194
+ - lib/answersengine/cli/job_output.rb
195
+ - lib/answersengine/cli/parser.rb
196
+ - lib/answersengine/cli/scraper.rb
197
+ - lib/answersengine/cli/scraper_deployment.rb
198
+ - lib/answersengine/cli/scraper_export.rb
199
+ - lib/answersengine/cli/scraper_exporter.rb
200
+ - lib/answersengine/cli/scraper_job.rb
201
+ - lib/answersengine/cli/scraper_page.rb
202
+ - lib/answersengine/cli/seeder.rb
203
+ - lib/answersengine/client.rb
204
+ - lib/answersengine/client/backblaze_content.rb
205
+ - lib/answersengine/client/base.rb
206
+ - lib/answersengine/client/export.rb
207
+ - lib/answersengine/client/global_page.rb
208
+ - lib/answersengine/client/job.rb
209
+ - lib/answersengine/client/job_export.rb
210
+ - lib/answersengine/client/job_log.rb
211
+ - lib/answersengine/client/job_output.rb
212
+ - lib/answersengine/client/job_page.rb
213
+ - lib/answersengine/client/job_stat.rb
214
+ - lib/answersengine/client/scraper.rb
215
+ - lib/answersengine/client/scraper_deployment.rb
216
+ - lib/answersengine/client/scraper_export.rb
217
+ - lib/answersengine/client/scraper_exporter.rb
218
+ - lib/answersengine/client/scraper_job.rb
219
+ - lib/answersengine/client/scraper_job_output.rb
220
+ - lib/answersengine/client/scraper_job_page.rb
221
+ - lib/answersengine/plugin.rb
222
+ - lib/answersengine/plugin/context_exposer.rb
223
+ - lib/answersengine/scraper.rb
224
+ - lib/answersengine/scraper/executor.rb
225
+ - lib/answersengine/scraper/parser.rb
226
+ - lib/answersengine/scraper/ruby_parser_executor.rb
227
+ - lib/answersengine/scraper/ruby_seeder_executor.rb
228
+ - lib/answersengine/scraper/seeder.rb
229
+ - lib/answersengine/version.rb
230
+ homepage: https://answersengine.com
231
+ licenses:
232
+ - MIT
233
+ metadata:
234
+ allowed_push_host: https://rubygems.org
235
+ post_install_message:
236
+ rdoc_options: []
237
+ require_paths:
238
+ - lib
239
+ required_ruby_version: !ruby/object:Gem::Requirement
240
+ requirements:
241
+ - - ">="
242
+ - !ruby/object:Gem::Version
243
+ version: 2.2.2
244
+ required_rubygems_version: !ruby/object:Gem::Requirement
245
+ requirements:
246
+ - - ">="
247
+ - !ruby/object:Gem::Version
248
+ version: '0'
249
+ requirements: []
250
+ rubyforge_project:
251
+ rubygems_version: 2.6.14.1
252
+ signing_key:
253
+ specification_version: 4
254
+ summary: AnswersEngine toolbelt for developers
255
+ test_files: []