ae_easy-core 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/CODE_OF_CONDUCT.md +1 -1
  3. data/Gemfile +1 -1
  4. data/LICENSE +1 -1
  5. data/README.md +8 -4
  6. data/Rakefile +0 -10
  7. data/ae_easy-core.gemspec +6 -13
  8. data/lib/ae_easy/core.rb +4 -256
  9. metadata +18 -125
  10. data/doc/AeEasy.html +0 -117
  11. data/doc/AeEasy/Core.html +0 -1590
  12. data/doc/AeEasy/Core/Config.html +0 -311
  13. data/doc/AeEasy/Core/Exception.html +0 -117
  14. data/doc/AeEasy/Core/Exception/OutdatedError.html +0 -135
  15. data/doc/AeEasy/Core/Helper.html +0 -117
  16. data/doc/AeEasy/Core/Helper/Cookie.html +0 -1070
  17. data/doc/AeEasy/Core/Mock.html +0 -282
  18. data/doc/AeEasy/Core/Mock/FakeDb.html +0 -3779
  19. data/doc/AeEasy/Core/Mock/FakeExecutor.html +0 -3289
  20. data/doc/AeEasy/Core/Mock/FakeFinisher.html +0 -160
  21. data/doc/AeEasy/Core/Mock/FakeParser.html +0 -160
  22. data/doc/AeEasy/Core/Mock/FakeSeeder.html +0 -160
  23. data/doc/AeEasy/Core/Plugin.html +0 -117
  24. data/doc/AeEasy/Core/Plugin/CollectionVault.html +0 -299
  25. data/doc/AeEasy/Core/Plugin/ConfigBehavior.html +0 -541
  26. data/doc/AeEasy/Core/Plugin/ContextIntegrator.html +0 -445
  27. data/doc/AeEasy/Core/Plugin/Executor.html +0 -259
  28. data/doc/AeEasy/Core/Plugin/ExecutorBehavior.html +0 -344
  29. data/doc/AeEasy/Core/Plugin/Finisher.html +0 -265
  30. data/doc/AeEasy/Core/Plugin/FinisherBehavior.html +0 -142
  31. data/doc/AeEasy/Core/Plugin/InitializeHook.html +0 -220
  32. data/doc/AeEasy/Core/Plugin/Parser.html +0 -270
  33. data/doc/AeEasy/Core/Plugin/ParserBehavior.html +0 -235
  34. data/doc/AeEasy/Core/Plugin/Seeder.html +0 -674
  35. data/doc/AeEasy/Core/Plugin/SeederBehavior.html +0 -142
  36. data/doc/AeEasy/Core/SmartCollection.html +0 -1087
  37. data/doc/_index.html +0 -364
  38. data/doc/class_list.html +0 -51
  39. data/doc/css/common.css +0 -1
  40. data/doc/css/full_list.css +0 -58
  41. data/doc/css/style.css +0 -496
  42. data/doc/file.README.html +0 -91
  43. data/doc/file_list.html +0 -56
  44. data/doc/frames.html +0 -17
  45. data/doc/index.html +0 -91
  46. data/doc/js/app.js +0 -303
  47. data/doc/js/full_list.js +0 -216
  48. data/doc/js/jquery.js +0 -4
  49. data/doc/method_list.html +0 -939
  50. data/doc/top-level-namespace.html +0 -110
  51. data/lib/ae_easy/core/config.rb +0 -27
  52. data/lib/ae_easy/core/exception.rb +0 -8
  53. data/lib/ae_easy/core/exception/outdated_error.rb +0 -9
  54. data/lib/ae_easy/core/helper.rb +0 -8
  55. data/lib/ae_easy/core/helper/cookie.rb +0 -209
  56. data/lib/ae_easy/core/mock.rb +0 -45
  57. data/lib/ae_easy/core/mock/fake_db.rb +0 -561
  58. data/lib/ae_easy/core/mock/fake_executor.rb +0 -373
  59. data/lib/ae_easy/core/mock/fake_finisher.rb +0 -28
  60. data/lib/ae_easy/core/mock/fake_parser.rb +0 -33
  61. data/lib/ae_easy/core/mock/fake_seeder.rb +0 -28
  62. data/lib/ae_easy/core/plugin.rb +0 -19
  63. data/lib/ae_easy/core/plugin/collection_vault.rb +0 -23
  64. data/lib/ae_easy/core/plugin/config_behavior.rb +0 -43
  65. data/lib/ae_easy/core/plugin/context_integrator.rb +0 -60
  66. data/lib/ae_easy/core/plugin/executor.rb +0 -19
  67. data/lib/ae_easy/core/plugin/executor_behavior.rb +0 -32
  68. data/lib/ae_easy/core/plugin/finisher.rb +0 -19
  69. data/lib/ae_easy/core/plugin/finisher_behavior.rb +0 -9
  70. data/lib/ae_easy/core/plugin/initialize_hook.rb +0 -17
  71. data/lib/ae_easy/core/plugin/parser.rb +0 -19
  72. data/lib/ae_easy/core/plugin/parser_behavior.rb +0 -17
  73. data/lib/ae_easy/core/plugin/seeder.rb +0 -44
  74. data/lib/ae_easy/core/plugin/seeder_behavior.rb +0 -9
  75. data/lib/ae_easy/core/smart_collection.rb +0 -236
  76. data/lib/ae_easy/core/version.rb +0 -6
@@ -1,110 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>
7
- Top Level Namespace
8
-
9
- &mdash; Documentation by YARD 0.9.20
10
-
11
- </title>
12
-
13
- <link rel="stylesheet" href="css/style.css" type="text/css" charset="utf-8" />
14
-
15
- <link rel="stylesheet" href="css/common.css" type="text/css" charset="utf-8" />
16
-
17
- <script type="text/javascript" charset="utf-8">
18
- pathId = "";
19
- relpath = '';
20
- </script>
21
-
22
-
23
- <script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
24
-
25
- <script type="text/javascript" charset="utf-8" src="js/app.js"></script>
26
-
27
-
28
- </head>
29
- <body>
30
- <div class="nav_wrap">
31
- <iframe id="nav" src="class_list.html?1"></iframe>
32
- <div id="resizer"></div>
33
- </div>
34
-
35
- <div id="main" tabindex="-1">
36
- <div id="header">
37
- <div id="menu">
38
-
39
- <a href="_index.html">Index</a> &raquo;
40
-
41
-
42
- <span class="title">Top Level Namespace</span>
43
-
44
- </div>
45
-
46
- <div id="search">
47
-
48
- <a class="full_list_link" id="class_list_link"
49
- href="class_list.html">
50
-
51
- <svg width="24" height="24">
52
- <rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
53
- <rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
54
- <rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
55
- </svg>
56
- </a>
57
-
58
- </div>
59
- <div class="clear"></div>
60
- </div>
61
-
62
- <div id="content"><h1>Top Level Namespace
63
-
64
-
65
-
66
- </h1>
67
- <div class="box_info">
68
-
69
-
70
-
71
-
72
-
73
-
74
-
75
-
76
-
77
-
78
-
79
- </div>
80
-
81
- <h2>Defined Under Namespace</h2>
82
- <p class="children">
83
-
84
-
85
- <strong class="modules">Modules:</strong> <span class='object_link'><a href="AeEasy.html" title="AeEasy (module)">AeEasy</a></span>
86
-
87
-
88
-
89
-
90
- </p>
91
-
92
-
93
-
94
-
95
-
96
-
97
-
98
-
99
-
100
- </div>
101
-
102
- <div id="footer">
103
- Generated on Fri Sep 27 02:01:30 2019 by
104
- <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
105
- 0.9.20 (ruby-2.5.3).
106
- </div>
107
-
108
- </div>
109
- </body>
110
- </html>
@@ -1,27 +0,0 @@
1
- module AeEasy
2
- module Core
3
- # Configuration manager tool useful for global configuration data across
4
- # the scraping process.
5
- class Config
6
- include AeEasy::Core::Plugin::InitializeHook
7
- include AeEasy::Core::Plugin::ConfigBehavior
8
-
9
- # {AeEasy::Core::Plugin::ConfigBehavior#config_collection_key}
10
- alias :collection_key :config_collection_key
11
- # {AeEasy::Core::Plugin::ConfigBehavior#config_collection}
12
- alias :collection :config_collection
13
-
14
- # Initialize config object
15
- #
16
- # @param [Hash] opts ({}) Configuration options.
17
- #
18
- # @see AeEasy::Core::Plugin::ConfigBehavior#initialize_hook_core_config_behavior
19
- def initialize opts = {}
20
- opts = opts.merge(
21
- config_collection: opts[:collection]
22
- )
23
- initialize_hooks opts
24
- end
25
- end
26
- end
27
- end
@@ -1,8 +0,0 @@
1
- require 'ae_easy/core/exception/outdated_error'
2
-
3
- module AeEasy
4
- module Core
5
- module Exception
6
- end
7
- end
8
- end
@@ -1,9 +0,0 @@
1
- module AeEasy
2
- module Core
3
- module Exception
4
- # Exception that indicates something is outdated.
5
- class OutdatedError < StandardError
6
- end
7
- end
8
- end
9
- end
@@ -1,8 +0,0 @@
1
- require 'ae_easy/core/helper/cookie'
2
-
3
- module AeEasy
4
- module Core
5
- module Helper
6
- end
7
- end
8
- end
@@ -1,209 +0,0 @@
1
- module AeEasy
2
- module Core
3
- module Helper
4
- # Helper used for lower level cookie management.
5
- class Cookie
6
- class << self
7
- # Parse request cookies on different formats.
8
- #
9
- # @param [String,Hash,Array] cookies Cookies to parse.
10
- # @param [Hash] cookie_hash ({}) External hash to store parsed cookies.
11
- #
12
- # @return [Hash]
13
- #
14
- # @example Parse from string.
15
- # parse_from_request 'aaa=111; bbb=222'
16
- # # => {'aaa' => 111, 'bbb' => 222}
17
- #
18
- # @example Parse from array.
19
- # cookies = [
20
- # 'aaa=111',
21
- # 'bbb=222'
22
- # ]
23
- # parse_from_request cookies
24
- # # => {'aaa' => 111, 'bbb' => 222}
25
- #
26
- # @example Parse with `cookie_hash`.
27
- # cookie_hash = {'ccc' => 333}
28
- # parse_from_request 'aaa=111; bbb=222', cookie_hash
29
- # cookie_hash
30
- # # => {'aaa' => 111, 'bbb' => 222, 'ccc' => 333}
31
- def parse_from_request cookies, cookie_hash = {}
32
- # Retrieve from hash
33
- if cookies.is_a? Hash
34
- cookie_hash.merge! cookies
35
- return cookie_hash
36
- end
37
-
38
- # Extract from string
39
- cookies = cookies.split '; ' if cookies.is_a? String
40
-
41
- # Extract from array
42
- cookies&.each do |raw_cookie|
43
- key, value = raw_cookie.split('=', 2)
44
- cookie_hash[key] = value
45
- end
46
- cookie_hash
47
- end
48
-
49
- # Parse response cookies on different formats.
50
- #
51
- # @param [String,Hash,Array] cookies Cookies to parse.
52
- # @param [Hash] cookie_hash ({}) External hash to store parsed cookies.
53
- #
54
- # @return [Hash]
55
- #
56
- # @example Parse from string
57
- # parse_from_response 'aaa=111; bbb=222'
58
- # # => {'aaa' => 111, 'bbb' => 222}
59
- #
60
- # @example Parse from array.
61
- # cookies = [
62
- # 'aaa=111; Expires=Thu, Jan 01 1970 00:00:00 UTC; path=/',
63
- # 'bbb=222; path=/',
64
- # 'ccc=333; path=/; expires=Wed, Jan 01 3000 00:00:00 UTC'
65
- # ]
66
- # parse_from_response cookies
67
- # # => {'bbb' => 222, 'ccc' => 333}
68
- #
69
- # @example Parse with `cookie_hash`.
70
- # cookie_hash = {'ccc' => 333}
71
- # parse_from_response 'aaa=111; bbb=222', cookie_hash
72
- # cookie_hash
73
- # # => {'aaa' => 111, 'bbb' => 222, 'ccc' => 333}
74
- def parse_from_response cookies, cookie_hash = {}
75
- # Retrieve from hash
76
- if cookies.is_a? Hash
77
- cookie_hash.merge! cookies
78
- return cookie_hash
79
- end
80
- # Retrieve from String
81
- cookies = cookies.split '; ' if cookies.is_a? String
82
-
83
- # Extract from array
84
- info = cookie = expires = key = value = nil
85
- cookies&.each do |raw_cookie|
86
- # Extract cookie data
87
- key_pair = raw_cookie.scan(/(?:;\s+([^\=]+)=([^;]*))/i) || []
88
- cookie = key_pair.inject(Hash.new){|h,i|h[i[0].to_s.downcase] = i[1]; h}
89
- cookie[:key], cookie[:value] = raw_cookie.match(/^\s*(?<key>[^\=]+)\=(?<value>[^;]*)/i)&.captures
90
-
91
- # Check cookie expire
92
- expires = cookie['expires'].nil? ? nil : Time.parse(cookie['expires'])
93
- if !expires.nil? && Time.now > expires
94
- cookie_hash.delete cookie[:key]
95
- next
96
- end
97
-
98
- # Save cookie
99
- cookie_hash[cookie[:key]] = cookie[:value]
100
- end
101
- cookie_hash
102
- end
103
-
104
- # Apply request and response cookies as a hash.
105
- #
106
- # @param [String,Array,Hash] request_cookies Cookies to parse.
107
- # @param [String,Array,Hash] response_cookies Cookies to parse.
108
- #
109
- # @return [Hash]
110
- #
111
- # @example
112
- # request_cookies = 'aaa=111; ddd=444'
113
- # response_cookies = [
114
- # 'aaa=111; Expires=Thu, Jan 01 1970 00:00:00 UTC; path=/',
115
- # 'bbb=222; path=/',
116
- # 'ccc=333; path=/; expires=Wed, Jan 01 3000 00:00:00 UTC'
117
- # ]
118
- # update_as_hash request_cookies, response_cookies
119
- # # => {'bbb' => 222, 'ccc' => 333, 'ddd' => 444}
120
- def update_as_hash request_cookies, response_cookies
121
- cookie_hash = {}
122
- parse_from_request request_cookies, cookie_hash
123
- parse_from_response response_cookies, cookie_hash
124
- cookie_hash
125
- end
126
-
127
- # Encode cookies as request cookie string.
128
- #
129
- # @param [Hash] cookie_hash Hash with cookies.
130
- #
131
- # @return [String]
132
- #
133
- # @example
134
- # cookie_hash = {
135
- # 'aaa' => 111,
136
- # 'bbb' => 222
137
- # }
138
- # encode_to_header cookie_hash
139
- # # => 'aaa=111; bbb=222'
140
- def encode_to_header cookie_hash
141
- cookie_hash.map{|k,v| "#{k}=#{v}"}.join '; '
142
- end
143
-
144
- # Apply request and response cookies as a string with request format.
145
- #
146
- # @param [String,Array,Hash] request_cookies Cookies to parse.
147
- # @param [String,Array,Hash] response_cookies Cookies to parse.
148
- #
149
- # @return [String]
150
- #
151
- # @example
152
- # request_cookies = 'aaa=111; ddd=444'
153
- # response_cookies = [
154
- # 'aaa=111; Expires=Thu, Jan 01 1970 00:00:00 UTC; path=/',
155
- # 'bbb=222; path=/',
156
- # 'ccc=333; path=/; expires=Wed, Jan 01 3000 00:00:00 UTC'
157
- # ]
158
- # update request_cookies, response_cookies
159
- # # => 'bbb=222; ccc=333; ddd=444'
160
- def update request_cookies, response_cookies
161
- cookie_hash = update_as_hash request_cookies, response_cookies
162
- encode_to_header cookie_hash
163
- end
164
-
165
- # Compare if cookie is included into base cookie
166
- #
167
- # @param [Hash] base_cookie_hash Hash that represent universe.
168
- # @param [Hash] cookie_hash Hash that represents to compare.
169
- #
170
- # @return [Boolean]
171
- #
172
- # @example Check a success match.
173
- # base_cookie_hash = {
174
- # 'aaa' => 111,
175
- # 'bbb' => 222,
176
- # 'ccc' => 333,
177
- # 'ddd' => 444
178
- # }
179
- # cookie_hash = {
180
- # 'bbb' => 222,
181
- # 'ddd' => 444
182
- # }
183
- # include? base_cookie_hash, cookie_hash
184
- # # => true
185
- #
186
- # @example Check with fail match.
187
- # base_cookie_hash = {
188
- # 'aaa' => 111,
189
- # 'bbb' => 222,
190
- # 'ccc' => 333,
191
- # 'ddd' => 444
192
- # }
193
- # cookie_hash = {
194
- # 'bbb' => 555,
195
- # 'ddd' => 444
196
- # }
197
- # include? base_cookie_hash, cookie_hash
198
- # # => false
199
- def include? base_cookie_hash, cookie_hash
200
- cookie_hash.each do |key, value|
201
- return false unless base_cookie_hash.has_key?(key) && base_cookie_hash[key] == value
202
- end
203
- true
204
- end
205
- end
206
- end
207
- end
208
- end
209
- end
@@ -1,45 +0,0 @@
1
- require 'ae_easy/core/mock/fake_db'
2
- require 'ae_easy/core/mock/fake_executor'
3
- require 'ae_easy/core/mock/fake_parser'
4
- require 'ae_easy/core/mock/fake_seeder'
5
- require 'ae_easy/core/mock/fake_finisher'
6
-
7
- module AeEasy
8
- module Core
9
- module Mock
10
- # Generate a context and message queue from a list of exposed methods.
11
- #
12
- # @param [Array] exposed_methods List of exposed methods.
13
- #
14
- # @example
15
- # exposed_methods = [:boo, :bar]
16
- # context, message_queue = AeEasy::Core::Mock.context_vars exposed_methods
17
- # context.boo 1, 2
18
- # context.bar 'A', 'B'
19
- # context.bar '111', '222'
20
- # message_queue
21
- # # => [
22
- # # [:boo, [1, 2]],
23
- # # [:bar, ['A', 'B']],
24
- # # [:bar, ['111', '222']]
25
- # # ]
26
- #
27
- # @return [Array] `[context, message_queue]` being:
28
- # * `context`: Object implementing exposed methods.
29
- # * `[Array] message_queue`: Array to store messages.
30
- def self.context_vars exposed_methods
31
- context = Object.new
32
- metaclass = class << context; self; end
33
- message_queue = [] # Beat reference bug
34
- exposed_methods = exposed_methods
35
- exposed_methods.each do |key|
36
- metaclass.send(:define_method, key) do |*args|
37
- # Record all method calls into message queue for easy access
38
- message_queue << [key, args]
39
- end
40
- end
41
- [context, message_queue]
42
- end
43
- end
44
- end
45
- end
@@ -1,561 +0,0 @@
1
- module AeEasy
2
- module Core
3
- module Mock
4
- # Fake in memory database that emulates `Answersengine` database objects' black box behavior.
5
- class FakeDb
6
- # Page id keys, analog to primary keys.
7
- PAGE_KEYS = ['gid'].freeze
8
- # Output id keys, analog to primary keys.
9
- OUTPUT_KEYS = ['_id', '_collection'].freeze
10
- # Job id keys, analog to primary keys.
11
- JOB_KEYS = ['job_id'].freeze
12
- # Job available status.
13
- JOB_STATUSES = {
14
- active: 'active',
15
- done: 'done',
16
- cancelled: 'cancelled',
17
- paused: 'paused'
18
- }
19
- # Default collection for saved outputs
20
- DEFAULT_COLLECTION = 'default'
21
-
22
- # Generate a smart collection with keys and initial values.
23
- #
24
- # @param [Array] keys Analog to primary keys, combination will be unique.
25
- # @param [Hash] opts Configuration options (see AeEasy::Core::SmartCollection#initialize).
26
- #
27
- # @return [AeEasy::Core::SmartCollection]
28
- def self.new_collection keys, opts = {}
29
- AeEasy::Core::SmartCollection.new keys, opts
30
- end
31
-
32
- # Generate a fake UUID.
33
- #
34
- # @param seed (nil) Object to use as seed for uuid.
35
- #
36
- # @return [String]
37
- def self.fake_uuid seed = nil
38
- seed ||= (Time.new.to_f + rand)
39
- Digest::SHA1.hexdigest seed.to_s
40
- end
41
-
42
- # Generate a fake UUID based on output fields without `_` prefix.
43
- #
44
- # @param [Hash] data Output data.
45
- #
46
- # @return [String]
47
- def self.output_uuid data
48
- seed = data.select{|k,v|k.to_s =~ /^[^_]/}.hash
49
- fake_uuid seed
50
- end
51
-
52
- # Build a page with defaults by using FakeDb engine.
53
- #
54
- # @param [Hash] page Page initial values.
55
- # @param [Hash] opts ({}) Configuration options (see #initialize).
56
- #
57
- # @return [Hash]
58
- def self.build_page page, opts = {}
59
- opts = {
60
- allow_page_gid_override: true,
61
- allow_job_id_override: true
62
- }.merge opts
63
- temp_db = AeEasy::Core::Mock::FakeDb.new opts
64
- temp_db.pages << page
65
- temp_db.pages.first
66
- end
67
-
68
- # Build a fake page by using FakeDb engine.
69
- #
70
- # @param [Hash] opts ({}) Configuration options (see #initialize).
71
- # @option opts [String] :url ('https://example.com') Page url.
72
- #
73
- # @return [Hash]
74
- def self.build_fake_page opts = {}
75
- page = {
76
- 'url' => (opts[:url] || 'https://example.com')
77
- }
78
- build_page page, opts
79
- end
80
-
81
- # Clean a URL to remove fragment, lowercase scheme and host, and sort
82
- # query string.
83
- #
84
- # @param [String] raw_url URL to clean.
85
- #
86
- # @return [String]
87
- def self.clean_uri raw_url
88
- url = URI.parse(raw_url)
89
- url.hostname = url.hostname.downcase
90
- url.fragment = nil
91
-
92
- # Sort query string keys
93
- unless url.query.nil?
94
- query_string = CGI.parse(url.query)
95
- keys = query_string.keys.sort
96
- data = []
97
- keys.each do |key|
98
- query_string[key].each do |value|
99
- data << "#{URI.encode key}=#{URI.encode value}"
100
- end
101
- end
102
- url.query = data.join('&')
103
- end
104
- url.to_s
105
- end
106
-
107
- # Format headers for gid generation.
108
- # @private
109
- #
110
- # @param [Hash,nil] headers Headers hash.
111
- #
112
- # @return [Hash]
113
- def self.format_headers headers
114
- return {} if headers.nil?
115
- data = {}
116
- headers.each do |key, value|
117
- unless value.is_a? Array
118
- data[key] = value
119
- next
120
- end
121
- data[key] = value.sort
122
- end
123
- data
124
- end
125
-
126
- # Build a job with defaults by using FakeDb engine.
127
- #
128
- # @param [Hash] job Job initial values.
129
- # @param [Hash] opts ({}) Configuration options (see #initialize).
130
- #
131
- # @return [Hash]
132
- def self.build_job job, opts = {}
133
- temp_db = AeEasy::Core::Mock::FakeDb.new opts
134
- temp_db.jobs << job
135
- temp_db.jobs.last
136
- end
137
-
138
- # Build a fake job by using FakeDb engine.
139
- #
140
- # @param [Hash] opts ({}) Configuration options (see #initialize).
141
- # @option opts [String] :scraper_name (nil) Scraper name.
142
- # @option opts [Integer] :job_id (nil) Job id.
143
- # @option opts [String] :status ('done').
144
- #
145
- # @return [Hash]
146
- def self.build_fake_job opts = {}
147
- job = {
148
- 'job_id' => opts[:job_id],
149
- 'scraper_name' => opts[:scraper_name],
150
- 'status' => (opts[:status] || 'done')
151
- }
152
- build_job job, opts
153
- end
154
-
155
- # Return a timestamp
156
- #
157
- # @param [Time] time (nil) Time from which to get time stamp.
158
- #
159
- # @return [String]
160
- def self.time_stamp time = nil
161
- time = Time.new if time.nil?
162
- time.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
163
- end
164
-
165
- # Get current job or create new one from values.
166
- #
167
- # @param [Integer] target_job_id (nil) Job id to ensure existence.
168
- #
169
- # @return [Hash]
170
- def ensure_job target_job_id = nil
171
- target_job_id = job_id if target_job_id.nil?
172
- job = jobs.find{|v|v['job_id'] == target_job_id}
173
- return job unless job.nil?
174
- job = {
175
- 'job_id' => target_job_id,
176
- 'scraper_name' => scraper_name,
177
- }
178
- job['status'] = 'active' unless target_job_id != job_id
179
- jobs << job
180
- jobs.last
181
- end
182
-
183
- # Fake scraper_name.
184
- # @return [String,nil]
185
- def scraper_name
186
- @scraper_name ||= 'my_scraper'
187
- end
188
-
189
- # Set fake scraper_name value.
190
- def scraper_name= value
191
- job = ensure_job
192
- @scraper_name = value
193
- job['scraper_name'] = scraper_name
194
- end
195
-
196
- # Fake job id.
197
- # @return [Integer,nil]
198
- def job_id
199
- @job_id ||= generate_job_id
200
- end
201
-
202
- # Set fake job id value.
203
- def job_id= value
204
- @job_id = value
205
- ensure_job
206
- job_id
207
- end
208
-
209
- # Current fake page gid.
210
- # @return [Integer,nil]
211
- def page_gid
212
- @page_gid ||= self.class.fake_uuid
213
- end
214
-
215
- # Set current fake page gid value.
216
- def page_gid= value
217
- @page_gid = value
218
- end
219
-
220
- # Enable page gid override on page or output insert.
221
- def enable_page_gid_override
222
- @allow_page_gid_override = true
223
- end
224
-
225
- # Disable page gid override on page or output insert.
226
- def disable_page_gid_override
227
- @allow_page_gid_override = false
228
- end
229
-
230
- # Indicates whether page gid overriding by user is allowed on page or
231
- # output insert.
232
- #
233
- # @return [Boolean] `true` when allowed, else `false`.
234
- def allow_page_gid_override?
235
- @allow_page_gid_override ||= false
236
- end
237
-
238
- # Enable job id override on page or output insert.
239
- def enable_job_id_override
240
- @allow_job_id_override = true
241
- end
242
-
243
- # Disable job id override on page or output insert.
244
- def disable_job_id_override
245
- @allow_job_id_override = false
246
- end
247
-
248
- # Indicates whether job id overriding by user is allowed on page or
249
- # output insert.
250
- #
251
- # @return [Boolean] `true` when allowed, else `false`.
252
- def allow_job_id_override?
253
- @allow_job_id_override ||= false
254
- end
255
-
256
- # Initialize fake database.
257
- #
258
- # @param [Hash] opts ({}) Configuration options.
259
- # @option opts [Integer,nil] :job_id Job id default value.
260
- # @option opts [String,nil] :scraper_name Scraper name default value.
261
- # @option opts [String,nil] :page_gid Page gid default value.
262
- # @option opts [Boolean, nil] :allow_page_gid_override (false) Specify
263
- # whether page gid can be overridden on page or output insert.
264
- # @option opts [Boolean, nil] :allow_job_id_override (false) Specify
265
- # whether job id can be overridden on page or output insert.
266
- def initialize opts = {}
267
- self.job_id = opts[:job_id]
268
- self.scraper_name = opts[:scraper_name]
269
- self.page_gid = opts[:page_gid]
270
- @allow_page_gid_override = opts[:allow_page_gid_override].nil? ? false : !!opts[:allow_page_gid_override]
271
- @allow_job_id_override = opts[:allow_job_id_override].nil? ? false : !!opts[:allow_job_id_override]
272
- end
273
-
274
- # Generate a fake scraper name.
275
- #
276
- # @return [String]
277
- def generate_scraper_name
278
- Faker::Internet.unique.slug
279
- end
280
-
281
- # Generate a fake job_id.
282
- #
283
- # @return [Integer]
284
- def generate_job_id
285
- jobs.count < 1 ? 1 : (jobs.max{|a,b|a['job_id'] <=> b['job_id']}['job_id'] + 1)
286
- end
287
-
288
- # Get job keys with key generators to emulate saving on db.
289
- # @private
290
- #
291
- # @return [Hash]
292
- def job_defaults
293
- @job_defaults ||= {
294
- 'job_id' => lambda{|job| generate_job_id},
295
- 'scraper_name' => lambda{|job| generate_scraper_name},
296
- 'status' => 'done',
297
- 'created_at' => lambda{|job| Time.now}
298
- }
299
- end
300
-
301
- # Stored job collection
302
- #
303
- # @return [AeEasy::Core::SmartCollection]
304
- def jobs
305
- return @jobs unless @jobs.nil?
306
- collection = self.class.new_collection JOB_KEYS,
307
- defaults: job_defaults
308
- collection.bind_event(:before_defaults) do |collection, raw_item|
309
- AeEasy::Core.deep_stringify_keys raw_item
310
- end
311
- collection.bind_event(:before_insert) do |collection, item, match|
312
- item['job_id'] ||= generate_job_id
313
- item
314
- end
315
- @jobs ||= collection
316
- end
317
-
318
- # Generate a fake UUID based on page data:
319
- # * url
320
- # * method
321
- # * headers
322
- # * fetch_type
323
- # * cookie
324
- # * no_redirect
325
- # * body
326
- # * ua_type
327
- #
328
- # @param [Hash] page_data Page data.
329
- #
330
- # @return [String]
331
- def generate_page_gid page_data
332
- fields = [
333
- 'url',
334
- 'method',
335
- 'headers',
336
- 'fetch_type',
337
- 'cookie',
338
- 'no_redirect',
339
- 'body',
340
- 'ua_type'
341
- ]
342
- data = page_data.select{|k,v|fields.include? k}
343
- data['url'] = self.class.clean_uri data['url']
344
- data['headers'] = self.class.format_headers data['headers']
345
- data['cookie'] = AeEasy::Core::Helper::Cookie.parse_from_request data['cookie'] unless data['cookie'].nil?
346
- seed = data.select{|k,v|fields.include? k}.hash
347
- checksum = self.class.fake_uuid seed
348
- "#{URI.parse(data['url']).hostname}-#{checksum}"
349
- end
350
-
351
- # Get page keys with key generators to emulate saving on db.
352
- # @private
353
- #
354
- # @return [Hash]
355
- def page_defaults
356
- @page_defaults ||= {
357
- 'url' => nil,
358
- 'status' => 'to_fetch',
359
- 'job_id' => lambda{|page| job_id},
360
- 'method' => 'GET',
361
- 'headers' => {},
362
- 'fetch_type' => 'standard',
363
- 'cookie' => nil,
364
- 'no_redirect' => false,
365
- 'body' => nil,
366
- 'ua_type' => 'desktop',
367
- 'no_url_encode' => false,
368
- 'http2' => false,
369
- 'vars' => {}
370
- }
371
- end
372
-
373
- # Stored page collection.
374
- #
375
- # @return [AeEasy::Core::SmartCollection]
376
- #
377
- # @note Page gid will be replaced on insert by an auto generated uuid
378
- # unless page gid overriding is enabled
379
- # (see #allow_page_gid_override?)
380
- def pages
381
- return @pages unless @page.nil?
382
-
383
- collection = self.class.new_collection PAGE_KEYS,
384
- defaults: page_defaults
385
- collection.bind_event(:before_defaults) do |collection, raw_item|
386
- item = AeEasy::Core.deep_stringify_keys raw_item
387
- item.delete 'job_id' unless allow_job_id_override?
388
- item
389
- end
390
- collection.bind_event(:before_insert) do |collection, item, match|
391
- if item['gid'].nil? || !allow_page_gid_override?
392
- item['gid'] = generate_page_gid item
393
- end
394
- item
395
- end
396
- collection.bind_event(:after_insert) do |collection, item|
397
- ensure_job item['job_id']
398
- end
399
- @pages ||= collection
400
- end
401
-
402
- # Generate a fake UUID for outputs.
403
- #
404
- # @param [Hash] data Output data.
405
- #
406
- # @return [String]
407
- def generate_output_id data
408
- # Generate random UUID to match AnswersEngine behavior
409
- self.class.fake_uuid
410
- end
411
-
412
- # Get output keys with key generators to emulate saving on db.
413
- # @private
414
- #
415
- # @return [Hash]
416
- def output_defaults
417
- @output_defaults ||= {
418
- '_collection' => DEFAULT_COLLECTION,
419
- '_job_id' => lambda{|output| job_id},
420
- '_created_at' => lambda{|output| self.class.time_stamp},
421
- '_gid' => lambda{|output| page_gid}
422
- }
423
- end
424
-
425
- # Stored output collection
426
- #
427
- # @return [AeEasy::Core::SmartCollection]
428
- def outputs
429
- return @outputs unless @outputs.nil?
430
- collection = self.class.new_collection OUTPUT_KEYS,
431
- defaults: output_defaults
432
- collection.bind_event(:before_defaults) do |collection, raw_item|
433
- item = AeEasy::Core.deep_stringify_keys raw_item
434
- item.delete '_job_id' unless allow_job_id_override?
435
- item.delete '_gid_id' unless allow_page_gid_override?
436
- item
437
- end
438
- collection.bind_event(:before_insert) do |collection, item, match|
439
- item['_id'] ||= generate_output_id item
440
- item
441
- end
442
- collection.bind_event(:after_insert) do |collection, item|
443
- ensure_job item['_job_id']
444
- end
445
- @outputs ||= collection
446
- end
447
-
448
- # Match data to filters.
449
- # @private
450
- #
451
- # @param data Hash containing data.
452
- # @param filters Filters to apply on match.
453
- #
454
- # @return [Boolean]
455
- #
456
- # @note Missing and `nil` values on `data` will match when `filters`'
457
- # field is `nil`.
458
- def match? data, filters
459
- filters.each do |key, value|
460
- return false if data[key] != value
461
- end
462
- true
463
- end
464
-
465
- # Search items from a collection.
466
- #
467
- # @param [Symbol] collection Allowed values: `:outputs`, `:pages`, `:jobs`.
468
- # @param [Hash] filter Filters to query.
469
- # @param [Integer] offset (0) Search results offset.
470
- # @param [Integer,nil] limit (nil) Limit search results count. Set to `nil` for unlimited.
471
- #
472
- # @raise [ArgumentError] On unknown collection.
473
- #
474
- # @note _Warning:_ It uses table scan to filter and should be used on test suites only.
475
- def query collection, filter, offset = 0, limit = nil
476
- return [] unless limit.nil? || limit > 0
477
-
478
- # Get collection items
479
- items = case collection
480
- when :outputs
481
- outputs
482
- when :pages
483
- pages
484
- when :jobs
485
- jobs
486
- else
487
- raise ArgumentError.new "Unknown collection #{collection}."
488
- end
489
-
490
- # Search items
491
- count = 0
492
- matches = []
493
- items.each do |item|
494
- next unless match? item, filter
495
- count += 1
496
-
497
- # Skip until offset
498
- next unless offset < count
499
- # Break on limit reach
500
- break unless limit.nil? || matches.count < limit
501
- matches << item
502
- end
503
- matches
504
- end
505
-
506
- # Refetch a page.
507
- #
508
- # @param [Integer] job_id Page's job_id to refetch.
509
- # @param [String] gid Page's gid to refetch.
510
- def refetch job_id, gid
511
- page = pages.find_match('gid' => gid, 'job_id' => job_id)
512
- raise Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"") if page.nil?
513
- page['status'] = 'to_fetch'
514
- page['freshness'] = self.class.time_stamp
515
- page['to_fetch'] = self.class.time_stamp
516
- page['fetched_from'] = nil
517
- page['fetching_at'] = '2001-01-01T00:00:00Z'
518
- page['fetched_at'] = nil
519
- page['fetching_try_count'] = 0
520
- page['effective_url'] = nil
521
- page['parsing_at'] = nil
522
- page['parsing_failed_at'] = nil
523
- page['parsed_at'] = nil
524
- page['parsing_try_count'] = 0
525
- page['parsing_fail_count'] = 0
526
- page['parsing_updated_at'] = '2001-01-01T00:00:00Z'
527
- page['response_checksum'] = nil
528
- page['response_status'] = nil
529
- page['response_status_code'] = nil
530
- page['response_headers'] = nil
531
- page['response_cookie'] = nil
532
- page['response_proto'] = nil
533
- page['content_type'] = nil
534
- page['content_size'] = 0
535
- page['failed_response_status_code'] = nil
536
- page['failed_response_headers'] = nil
537
- page['failed_response_cookie'] = nil
538
- page['failed_effective_url'] = nil
539
- page['failed_at'] = nil
540
- page['failed_content_type'] = nil
541
- end
542
-
543
- # Reparse a page.
544
- #
545
- # @param [Integer] job_id Page's job_id to reparse.
546
- # @param [String] gid Page's gid to reparse.
547
- def reparse job_id, gid
548
- page = pages.find_match('gid' => gid, 'job_id' => job_id)
549
- raise Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"") if page.nil?
550
- page['status'] = 'to_parse'
551
- page['parsing_at'] = nil
552
- page['parsing_failed_at'] = nil
553
- page['parsing_updated_at'] = '2001-01-01T00:00:00Z'
554
- page['parsed_at'] = nil
555
- page['parsing_try_count'] = 0
556
- page['parsing_fail_count'] = 0
557
- end
558
- end
559
- end
560
- end
561
- end