answersengine 0.2.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +30 -0
  8. data/Rakefile +22 -0
  9. data/answersengine.gemspec +45 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +8 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/answersengine +3 -0
  25. data/lib/answersengine.rb +5 -0
  26. data/lib/answersengine/cli.rb +33 -0
  27. data/lib/answersengine/cli/global_page.rb +39 -0
  28. data/lib/answersengine/cli/job.rb +30 -0
  29. data/lib/answersengine/cli/job_output.rb +69 -0
  30. data/lib/answersengine/cli/parser.rb +64 -0
  31. data/lib/answersengine/cli/scraper.rb +172 -0
  32. data/lib/answersengine/cli/scraper_deployment.rb +24 -0
  33. data/lib/answersengine/cli/scraper_export.rb +51 -0
  34. data/lib/answersengine/cli/scraper_exporter.rb +40 -0
  35. data/lib/answersengine/cli/scraper_job.rb +71 -0
  36. data/lib/answersengine/cli/scraper_page.rb +200 -0
  37. data/lib/answersengine/cli/seeder.rb +40 -0
  38. data/lib/answersengine/client.rb +23 -0
  39. data/lib/answersengine/client/backblaze_content.rb +45 -0
  40. data/lib/answersengine/client/base.rb +50 -0
  41. data/lib/answersengine/client/export.rb +10 -0
  42. data/lib/answersengine/client/global_page.rb +18 -0
  43. data/lib/answersengine/client/job.rb +53 -0
  44. data/lib/answersengine/client/job_export.rb +10 -0
  45. data/lib/answersengine/client/job_log.rb +27 -0
  46. data/lib/answersengine/client/job_output.rb +19 -0
  47. data/lib/answersengine/client/job_page.rb +62 -0
  48. data/lib/answersengine/client/job_stat.rb +16 -0
  49. data/lib/answersengine/client/scraper.rb +54 -0
  50. data/lib/answersengine/client/scraper_deployment.rb +17 -0
  51. data/lib/answersengine/client/scraper_export.rb +22 -0
  52. data/lib/answersengine/client/scraper_exporter.rb +14 -0
  53. data/lib/answersengine/client/scraper_job.rb +49 -0
  54. data/lib/answersengine/client/scraper_job_output.rb +19 -0
  55. data/lib/answersengine/client/scraper_job_page.rb +55 -0
  56. data/lib/answersengine/plugin.rb +6 -0
  57. data/lib/answersengine/plugin/context_exposer.rb +55 -0
  58. data/lib/answersengine/scraper.rb +16 -0
  59. data/lib/answersengine/scraper/executor.rb +292 -0
  60. data/lib/answersengine/scraper/parser.rb +18 -0
  61. data/lib/answersengine/scraper/ruby_parser_executor.rb +141 -0
  62. data/lib/answersengine/scraper/ruby_seeder_executor.rb +114 -0
  63. data/lib/answersengine/scraper/seeder.rb +18 -0
  64. data/lib/answersengine/version.rb +3 -0
  65. metadata +255 -0
@@ -0,0 +1,18 @@
module AnswersEngine
  module Scraper
    # Chooses a parser executor from the script's file extension and runs it.
    class Parser
      # Runs the parser script +filename+ for the page identified by +gid+.
      #
      # Only '.rb' scripts are handled; they are delegated to
      # RubyParserExecutor. For any other extension a notice is printed to
      # stdout and nil is returned.
      #
      # @param filename path of the parser script; only its extension is inspected here
      # @param gid      page GID, forwarded to the executor
      # @param job_id   optional job id, forwarded to the executor
      # @param save     forwarded to RubyParserExecutor#exec_parser
      # @param vars     page vars, forwarded to the executor
      # @return whatever the executor returns, or nil for an unsupported extension
      def self.exec_parser_page(filename, gid, job_id=nil, save=false, vars = {})
        extension = File.extname(filename)

        unless extension == '.rb'
          puts "Unable to find a parser executor for file type \"#{extension}\""
          return
        end

        ruby_executor = RubyParserExecutor.new(
          filename: filename,
          gid: gid,
          job_id: job_id,
          vars: vars
        )
        ruby_executor.exec_parser(save)
      end
    end
  end
end
@@ -0,0 +1,141 @@
module AnswersEngine
  module Scraper
    # Evaluates a Ruby parser script for a single page (identified by GID)
    # and, when saving is enabled, reports status and results to the server.
    #
    # NOTE(review): +parsing_update+, +init_page+, +isolated_binding+,
    # +eval_with_context+, +save_pages_and_outputs+, +get_content+,
    # +get_failed_content+, +clean_backtrace+ and the +filename+ / +gid+ /
    # +job_id+ readers are not defined in this class; presumably they are
    # inherited from Executor -- confirm there.
    class RubyParserExecutor < Executor
      # Set by #exec_parser. The status-update methods are no-ops while this
      # is falsy.
      attr_accessor :save

      # @param options [Hash]
      #   :filename -- parser script path (required)
      #   :gid      -- page GID (required)
      #   :job_id   -- job id (optional, defaults to nil)
      #   :vars     -- page vars (optional, defaults to an empty Hash)
      # @raise [RuntimeError] when :filename or :gid is missing
      def initialize(options = {})
        @filename = options.fetch(:filename) { raise "Filename is required" }
        @gid = options.fetch(:gid) { raise "GID is required" }
        # job_id is optional (Parser.exec_parser_page defaults it to nil), so a
        # missing key must not raise KeyError.
        @job_id = options.fetch(:job_id, nil)
        @page_vars = options.fetch(:vars) { {} }
      end

      # Names of the methods listed for exposure to parser scripts.
      #
      # @return [Array<Symbol>] frozen list of method names
      def self.exposed_methods
        %i[
          content
          failed_content
          outputs
          pages
          page
          save_pages
          save_outputs
          find_output
          find_outputs
        ].freeze
      end

      # Records the save flag, announces the run mode and evaluates the script.
      #
      # @param save [Boolean] true to execute and report to the server,
      #   false for a trial run
      # @return the result of #eval_parser_script
      def exec_parser(save = false)
        @save = save
        puts(save ? "Executing parser script" : "Trying parser script")

        eval_parser_script(save)
      end

      # Attaches the vars given at construction time to +page+ under the
      # 'vars' key. The page is left untouched when no vars were supplied.
      #
      # @param page the page object, indexed with a String key
      # @return the same +page+ object
      def init_page_vars(page)
        page['vars'] = @page_vars unless @page_vars.nil? || @page_vars.empty?
        page
      end

      # Forwards pages, outputs and status from +opts+ to +parsing_update+.
      #
      # @param opts [Hash] reads :job_id, :gid, :pages, :outputs and :status
      # @return the value returned by +parsing_update+
      def update_to_server(opts = {})
        parsing_update(
          job_id: opts[:job_id],
          gid: opts[:gid],
          pages: opts[:pages],
          outputs: opts[:outputs],
          parsing_status: opts[:status]
        )
      end

      # Marks the page's parsing status as :starting. Does nothing unless
      # #save is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200
      def update_parsing_starting_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :starting
        )

        if response.code == 200
          puts "Page Parsing Status Updated."
        else
          puts "Error: Unable to save Page Parsing Status to server: #{response.body}"
          raise "Unable to save Page Parsing Status to server: #{response.body}"
        end
      end

      # Marks the page's parsing status as :done. Does nothing unless #save
      # is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200
      def update_parsing_done_status
        return unless save

        response = parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :done
        )

        if response.code == 200
          puts "Page Parsing Done."
        else
          puts "Error: Unable to save Page Parsing Done Status to server: #{response.body}"
          raise "Unable to save Page Parsing Done Status to server: #{response.body}"
        end
      end

      # @return [Symbol] always :parsing
      def save_type
        :parsing
      end

      # Evaluates the parser script with +outputs+, +pages+ and +page+ made
      # available through +isolated_binding+, then saves what the script
      # collected and marks parsing as done.
      #
      # Any SyntaxError or StandardError raised by the script is re-raised;
      # when +save+ is truthy it is first reported through #handle_error.
      #
      # @param save [Boolean] whether script failures are reported to the server
      # @return the result of #update_parsing_done_status
      def eval_parser_script(save = false)
        update_parsing_starting_status

        outputs = []
        pages = []
        page = init_page_vars(init_page)

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages,
            page: page
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is not a StandardError, so it has to be named
          # explicitly to get the same reporting as runtime failures.
          handle_error(e) if save
          raise
        end

        puts "=========== Parsing Executed ==========="
        save_pages_and_outputs(pages, outputs, :parsing)
        update_parsing_done_status
      end

      # Result of +get_content(gid)+, memoized after the first truthy result.
      def content
        @content ||= get_content(gid)
      end

      # Result of +get_failed_content(gid)+, memoized after the first truthy
      # result.
      def failed_content
        @failed_content ||= get_failed_content(gid)
      end

      # Sends a :failed parsing status to the server together with a log
      # entry built from the exception class, message, job id, GID and the
      # cleaned backtrace.
      #
      # @param e [Exception] the error raised by the parser script
      # @return the value returned by +parsing_update+
      def handle_error(e)
        error = [
          "Parsing #{e.class}: #{e} (Job:#{job_id} GID:#{gid})",
          clean_backtrace(e.backtrace)
        ].join("\n")

        parsing_update(
          job_id: job_id,
          gid: gid,
          parsing_status: :failed,
          log_error: error
        )
      end
    end
  end
end
@@ -0,0 +1,114 @@
module AnswersEngine
  module Scraper
    # Evaluates a Ruby seeder script and, when saving is enabled, reports
    # status and results to the server.
    #
    # NOTE(review): +seeding_update+, +isolated_binding+, +eval_with_context+,
    # +save_pages_and_outputs+, +clean_backtrace+ and the +filename+ /
    # +job_id+ readers are not defined in this class; presumably they are
    # inherited from Executor -- confirm there.
    class RubySeederExecutor < Executor
      # Set by #exec_seeder. The status-update methods are no-ops while this
      # is falsy.
      attr_accessor :save

      # @param options [Hash]
      #   :filename -- seeder script path (required)
      #   :job_id   -- job id (optional, nil when absent)
      # @raise [RuntimeError] when :filename is missing
      def initialize(options = {})
        @filename = options.fetch(:filename) { raise "Filename is required" }
        @job_id = options[:job_id]
      end

      # Names of the methods listed for exposure to seeder scripts.
      #
      # @return [Array<Symbol>] frozen list of method names
      def self.exposed_methods
        %i[
          outputs
          pages
          save_pages
          save_outputs
          find_output
          find_outputs
        ].freeze
      end

      # Records the save flag, announces the run mode and evaluates the script.
      #
      # @param save [Boolean] true to execute and report to the server,
      #   false for a trial run
      # @return the result of #eval_seeder_script
      def exec_seeder(save = false)
        @save = save
        puts(save ? "Executing seeder script" : "Trying seeder script")

        eval_seeder_script(save)
      end

      # Evaluates the seeder script with +outputs+ and +pages+ made available
      # through +isolated_binding+, then saves what the script collected and
      # marks seeding as done.
      #
      # Any SyntaxError or StandardError raised by the script is re-raised;
      # when +save+ is truthy it is first reported through #handle_error.
      #
      # @param save [Boolean] whether script failures are reported to the server
      # @return the result of #update_seeding_done_status
      def eval_seeder_script(save = false)
        update_seeding_starting_status

        outputs = []
        pages = []

        begin
          context = isolated_binding({
            outputs: outputs,
            pages: pages
          })
          eval_with_context filename, context
        rescue SyntaxError, StandardError => e
          # SyntaxError is not a StandardError, so it has to be named
          # explicitly to get the same reporting as runtime failures.
          handle_error(e) if save
          raise
        end

        puts "=========== Seeding Executed ==========="
        save_pages_and_outputs(pages, outputs, :seeding)
        update_seeding_done_status
      end

      # @return [Symbol] always :seeding
      def save_type
        :seeding
      end

      # Forwards pages, outputs and status from +opts+ to +seeding_update+.
      #
      # @param opts [Hash] reads :job_id, :pages, :outputs and :status
      # @return the value returned by +seeding_update+
      def update_to_server(opts = {})
        seeding_update(
          job_id: opts[:job_id],
          pages: opts[:pages],
          outputs: opts[:outputs],
          seeding_status: opts[:status]
        )
      end

      # Marks the job's seeding status as :starting. Does nothing unless
      # #save is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200
      def update_seeding_starting_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :starting
        )

        if response.code == 200
          puts "Seeding Status Updated."
        else
          puts "Error: Unable to save Seeding Status to server: #{response.body}"
          raise "Unable to save Seeding Status to server: #{response.body}"
        end
      end

      # Marks the job's seeding status as :done. Does nothing unless #save
      # is truthy.
      #
      # @raise [RuntimeError] when the response code is not 200
      def update_seeding_done_status
        return unless save

        response = seeding_update(
          job_id: job_id,
          seeding_status: :done
        )

        if response.code == 200
          puts "Seeding Done."
        else
          puts "Error: Unable to save Seeding Done Status to server: #{response.body}"
          raise "Unable to save Seeding Done Status to server: #{response.body}"
        end
      end

      # Sends a :failed seeding status to the server together with a log
      # entry built from the exception class, message, job id and the cleaned
      # backtrace.
      #
      # @param e [Exception] the error raised by the seeder script
      # @return the value returned by +seeding_update+
      def handle_error(e)
        # The job id is wrapped in a balanced "(Job:...)" pair; the closing
        # parenthesis was previously missing from the logged message.
        error = [
          "Seeding #{e.class}: #{e} (Job:#{job_id})",
          clean_backtrace(e.backtrace)
        ].join("\n")

        seeding_update(
          job_id: job_id,
          seeding_status: :failed,
          log_error: error
        )
      end
    end
  end
end
@@ -0,0 +1,18 @@
module AnswersEngine
  module Scraper
    # Chooses a seeder executor from the script's file extension and runs it.
    class Seeder
      # Runs the seeder script +filename+.
      #
      # Only '.rb' scripts are handled; they are delegated to
      # RubySeederExecutor. For any other extension a notice is printed to
      # stdout and nil is returned.
      #
      # @param filename path of the seeder script; only its extension is inspected here
      # @param job_id   optional job id, forwarded to the executor
      # @param save     forwarded to RubySeederExecutor#exec_seeder
      # @return whatever the executor returns, or nil for an unsupported extension
      def self.exec_seeder(filename, job_id=nil, save=false)
        extension = File.extname(filename)

        unless extension == '.rb'
          puts "Unable to find a seeder executor for file type \"#{extension}\""
          return
        end

        ruby_executor = RubySeederExecutor.new(filename: filename, job_id: job_id)
        ruby_executor.exec_seeder(save)
      end
    end
  end
end
@@ -0,0 +1,3 @@
module AnswersEngine
  # Version string of the answersengine gem. Frozen so the shared constant
  # cannot be modified in place by code that reads it.
  VERSION = "0.2.33".freeze
end
metadata ADDED
@@ -0,0 +1,255 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: answersengine
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.33
5
+ platform: ruby
6
+ authors:
7
+ - Parama Danoesubroto
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-03-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.20.3
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.20.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: httparty
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.16.2
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.16.2
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ - - "<"
49
+ - !ruby/object:Gem::Version
50
+ version: '1.10'
51
+ type: :runtime
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '1.6'
58
+ - - "<"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.10'
61
+ - !ruby/object:Gem::Dependency
62
+ name: bundler
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '1.16'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '1.16'
75
+ - !ruby/object:Gem::Dependency
76
+ name: rake
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '10.0'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '10.0'
89
+ - !ruby/object:Gem::Dependency
90
+ name: minitest
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '5.11'
96
+ type: :development
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '5.11'
103
+ - !ruby/object:Gem::Dependency
104
+ name: simplecov
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 0.16.1
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: 0.16.1
117
+ - !ruby/object:Gem::Dependency
118
+ name: simplecov-console
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: 0.4.2
124
+ type: :development
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: 0.4.2
131
+ - !ruby/object:Gem::Dependency
132
+ name: timecop
133
+ requirement: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: 0.9.1
138
+ type: :development
139
+ prerelease: false
140
+ version_requirements: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: 0.9.1
145
+ - !ruby/object:Gem::Dependency
146
+ name: byebug
147
+ requirement: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ type: :development
153
+ prerelease: false
154
+ version_requirements: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - ">="
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ description: AnswersEngine toolbelt to develop scrapers and other scripts
160
+ email:
161
+ - parama@answersengine.com
162
+ executables:
163
+ - answersengine
164
+ extensions: []
165
+ extra_rdoc_files: []
166
+ files:
167
+ - ".gitignore"
168
+ - ".travis.yml"
169
+ - CODE_OF_CONDUCT.md
170
+ - Gemfile
171
+ - LICENSE.txt
172
+ - README.md
173
+ - Rakefile
174
+ - answersengine.gemspec
175
+ - bin/console
176
+ - bin/setup
177
+ - examples/fetchtest/libraries/hello.rb
178
+ - examples/fetchtest/libraries/hello_fail.rb
179
+ - examples/fetchtest/parsers/failed.rb
180
+ - examples/fetchtest/parsers/find_outputs.rb
181
+ - examples/fetchtest/parsers/home.rb
182
+ - examples/fetchtest/parsers/nested_fail.rb
183
+ - examples/fetchtest/parsers/simple.rb
184
+ - examples/fetchtest/seeders/csv_seeder.rb
185
+ - examples/fetchtest/seeders/failed.rb
186
+ - examples/fetchtest/seeders/list_of_urls.csv
187
+ - examples/fetchtest/seeders/seed.rb
188
+ - examples/fetchtest/seeders/test_reset_page.rb
189
+ - exe/answersengine
190
+ - lib/answersengine.rb
191
+ - lib/answersengine/cli.rb
192
+ - lib/answersengine/cli/global_page.rb
193
+ - lib/answersengine/cli/job.rb
194
+ - lib/answersengine/cli/job_output.rb
195
+ - lib/answersengine/cli/parser.rb
196
+ - lib/answersengine/cli/scraper.rb
197
+ - lib/answersengine/cli/scraper_deployment.rb
198
+ - lib/answersengine/cli/scraper_export.rb
199
+ - lib/answersengine/cli/scraper_exporter.rb
200
+ - lib/answersengine/cli/scraper_job.rb
201
+ - lib/answersengine/cli/scraper_page.rb
202
+ - lib/answersengine/cli/seeder.rb
203
+ - lib/answersengine/client.rb
204
+ - lib/answersengine/client/backblaze_content.rb
205
+ - lib/answersengine/client/base.rb
206
+ - lib/answersengine/client/export.rb
207
+ - lib/answersengine/client/global_page.rb
208
+ - lib/answersengine/client/job.rb
209
+ - lib/answersengine/client/job_export.rb
210
+ - lib/answersengine/client/job_log.rb
211
+ - lib/answersengine/client/job_output.rb
212
+ - lib/answersengine/client/job_page.rb
213
+ - lib/answersengine/client/job_stat.rb
214
+ - lib/answersengine/client/scraper.rb
215
+ - lib/answersengine/client/scraper_deployment.rb
216
+ - lib/answersengine/client/scraper_export.rb
217
+ - lib/answersengine/client/scraper_exporter.rb
218
+ - lib/answersengine/client/scraper_job.rb
219
+ - lib/answersengine/client/scraper_job_output.rb
220
+ - lib/answersengine/client/scraper_job_page.rb
221
+ - lib/answersengine/plugin.rb
222
+ - lib/answersengine/plugin/context_exposer.rb
223
+ - lib/answersengine/scraper.rb
224
+ - lib/answersengine/scraper/executor.rb
225
+ - lib/answersengine/scraper/parser.rb
226
+ - lib/answersengine/scraper/ruby_parser_executor.rb
227
+ - lib/answersengine/scraper/ruby_seeder_executor.rb
228
+ - lib/answersengine/scraper/seeder.rb
229
+ - lib/answersengine/version.rb
230
+ homepage: https://answersengine.com
231
+ licenses:
232
+ - MIT
233
+ metadata:
234
+ allowed_push_host: https://rubygems.org
235
+ post_install_message:
236
+ rdoc_options: []
237
+ require_paths:
238
+ - lib
239
+ required_ruby_version: !ruby/object:Gem::Requirement
240
+ requirements:
241
+ - - ">="
242
+ - !ruby/object:Gem::Version
243
+ version: 2.2.2
244
+ required_rubygems_version: !ruby/object:Gem::Requirement
245
+ requirements:
246
+ - - ">="
247
+ - !ruby/object:Gem::Version
248
+ version: '0'
249
+ requirements: []
250
+ rubyforge_project:
251
+ rubygems_version: 2.6.14.1
252
+ signing_key:
253
+ specification_version: 4
254
+ summary: AnswersEngine toolbelt for developers
255
+ test_files: []