ae_easy-core 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/CODE_OF_CONDUCT.md +1 -1
  3. data/Gemfile +1 -1
  4. data/LICENSE +1 -1
  5. data/README.md +8 -4
  6. data/Rakefile +0 -10
  7. data/ae_easy-core.gemspec +6 -13
  8. data/lib/ae_easy/core.rb +4 -256
  9. metadata +18 -125
  10. data/doc/AeEasy.html +0 -117
  11. data/doc/AeEasy/Core.html +0 -1590
  12. data/doc/AeEasy/Core/Config.html +0 -311
  13. data/doc/AeEasy/Core/Exception.html +0 -117
  14. data/doc/AeEasy/Core/Exception/OutdatedError.html +0 -135
  15. data/doc/AeEasy/Core/Helper.html +0 -117
  16. data/doc/AeEasy/Core/Helper/Cookie.html +0 -1070
  17. data/doc/AeEasy/Core/Mock.html +0 -282
  18. data/doc/AeEasy/Core/Mock/FakeDb.html +0 -3779
  19. data/doc/AeEasy/Core/Mock/FakeExecutor.html +0 -3289
  20. data/doc/AeEasy/Core/Mock/FakeFinisher.html +0 -160
  21. data/doc/AeEasy/Core/Mock/FakeParser.html +0 -160
  22. data/doc/AeEasy/Core/Mock/FakeSeeder.html +0 -160
  23. data/doc/AeEasy/Core/Plugin.html +0 -117
  24. data/doc/AeEasy/Core/Plugin/CollectionVault.html +0 -299
  25. data/doc/AeEasy/Core/Plugin/ConfigBehavior.html +0 -541
  26. data/doc/AeEasy/Core/Plugin/ContextIntegrator.html +0 -445
  27. data/doc/AeEasy/Core/Plugin/Executor.html +0 -259
  28. data/doc/AeEasy/Core/Plugin/ExecutorBehavior.html +0 -344
  29. data/doc/AeEasy/Core/Plugin/Finisher.html +0 -265
  30. data/doc/AeEasy/Core/Plugin/FinisherBehavior.html +0 -142
  31. data/doc/AeEasy/Core/Plugin/InitializeHook.html +0 -220
  32. data/doc/AeEasy/Core/Plugin/Parser.html +0 -270
  33. data/doc/AeEasy/Core/Plugin/ParserBehavior.html +0 -235
  34. data/doc/AeEasy/Core/Plugin/Seeder.html +0 -674
  35. data/doc/AeEasy/Core/Plugin/SeederBehavior.html +0 -142
  36. data/doc/AeEasy/Core/SmartCollection.html +0 -1087
  37. data/doc/_index.html +0 -364
  38. data/doc/class_list.html +0 -51
  39. data/doc/css/common.css +0 -1
  40. data/doc/css/full_list.css +0 -58
  41. data/doc/css/style.css +0 -496
  42. data/doc/file.README.html +0 -91
  43. data/doc/file_list.html +0 -56
  44. data/doc/frames.html +0 -17
  45. data/doc/index.html +0 -91
  46. data/doc/js/app.js +0 -303
  47. data/doc/js/full_list.js +0 -216
  48. data/doc/js/jquery.js +0 -4
  49. data/doc/method_list.html +0 -939
  50. data/doc/top-level-namespace.html +0 -110
  51. data/lib/ae_easy/core/config.rb +0 -27
  52. data/lib/ae_easy/core/exception.rb +0 -8
  53. data/lib/ae_easy/core/exception/outdated_error.rb +0 -9
  54. data/lib/ae_easy/core/helper.rb +0 -8
  55. data/lib/ae_easy/core/helper/cookie.rb +0 -209
  56. data/lib/ae_easy/core/mock.rb +0 -45
  57. data/lib/ae_easy/core/mock/fake_db.rb +0 -561
  58. data/lib/ae_easy/core/mock/fake_executor.rb +0 -373
  59. data/lib/ae_easy/core/mock/fake_finisher.rb +0 -28
  60. data/lib/ae_easy/core/mock/fake_parser.rb +0 -33
  61. data/lib/ae_easy/core/mock/fake_seeder.rb +0 -28
  62. data/lib/ae_easy/core/plugin.rb +0 -19
  63. data/lib/ae_easy/core/plugin/collection_vault.rb +0 -23
  64. data/lib/ae_easy/core/plugin/config_behavior.rb +0 -43
  65. data/lib/ae_easy/core/plugin/context_integrator.rb +0 -60
  66. data/lib/ae_easy/core/plugin/executor.rb +0 -19
  67. data/lib/ae_easy/core/plugin/executor_behavior.rb +0 -32
  68. data/lib/ae_easy/core/plugin/finisher.rb +0 -19
  69. data/lib/ae_easy/core/plugin/finisher_behavior.rb +0 -9
  70. data/lib/ae_easy/core/plugin/initialize_hook.rb +0 -17
  71. data/lib/ae_easy/core/plugin/parser.rb +0 -19
  72. data/lib/ae_easy/core/plugin/parser_behavior.rb +0 -17
  73. data/lib/ae_easy/core/plugin/seeder.rb +0 -44
  74. data/lib/ae_easy/core/plugin/seeder_behavior.rb +0 -9
  75. data/lib/ae_easy/core/smart_collection.rb +0 -236
  76. data/lib/ae_easy/core/version.rb +0 -6
@@ -1,110 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>
7
- Top Level Namespace
8
-
9
- &mdash; Documentation by YARD 0.9.20
10
-
11
- </title>
12
-
13
- <link rel="stylesheet" href="css/style.css" type="text/css" charset="utf-8" />
14
-
15
- <link rel="stylesheet" href="css/common.css" type="text/css" charset="utf-8" />
16
-
17
- <script type="text/javascript" charset="utf-8">
18
- pathId = "";
19
- relpath = '';
20
- </script>
21
-
22
-
23
- <script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
24
-
25
- <script type="text/javascript" charset="utf-8" src="js/app.js"></script>
26
-
27
-
28
- </head>
29
- <body>
30
- <div class="nav_wrap">
31
- <iframe id="nav" src="class_list.html?1"></iframe>
32
- <div id="resizer"></div>
33
- </div>
34
-
35
- <div id="main" tabindex="-1">
36
- <div id="header">
37
- <div id="menu">
38
-
39
- <a href="_index.html">Index</a> &raquo;
40
-
41
-
42
- <span class="title">Top Level Namespace</span>
43
-
44
- </div>
45
-
46
- <div id="search">
47
-
48
- <a class="full_list_link" id="class_list_link"
49
- href="class_list.html">
50
-
51
- <svg width="24" height="24">
52
- <rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
53
- <rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
54
- <rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
55
- </svg>
56
- </a>
57
-
58
- </div>
59
- <div class="clear"></div>
60
- </div>
61
-
62
- <div id="content"><h1>Top Level Namespace
63
-
64
-
65
-
66
- </h1>
67
- <div class="box_info">
68
-
69
-
70
-
71
-
72
-
73
-
74
-
75
-
76
-
77
-
78
-
79
- </div>
80
-
81
- <h2>Defined Under Namespace</h2>
82
- <p class="children">
83
-
84
-
85
- <strong class="modules">Modules:</strong> <span class='object_link'><a href="AeEasy.html" title="AeEasy (module)">AeEasy</a></span>
86
-
87
-
88
-
89
-
90
- </p>
91
-
92
-
93
-
94
-
95
-
96
-
97
-
98
-
99
-
100
- </div>
101
-
102
- <div id="footer">
103
- Generated on Fri Sep 27 02:01:30 2019 by
104
- <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
105
- 0.9.20 (ruby-2.5.3).
106
- </div>
107
-
108
- </div>
109
- </body>
110
- </html>
@@ -1,27 +0,0 @@
1
- module AeEasy
2
- module Core
3
- # Configuration manager tool useful for global configuration data across
4
- # the scraping process.
5
- class Config
6
- include AeEasy::Core::Plugin::InitializeHook
7
- include AeEasy::Core::Plugin::ConfigBehavior
8
-
9
- # {AeEasy::Core::Plugin::ConfigBehavior#config_collection_key}
10
- alias :collection_key :config_collection_key
11
- # {AeEasy::Core::Plugin::ConfigBehavior#config_collection}
12
- alias :collection :config_collection
13
-
14
- # Initialize config object
15
- #
16
- # @param [Hash] opts ({}) Configuration options.
17
- #
18
- # @see AeEasy::Core::Plugin::ConfigBehavior#initialize_hook_core_config_behavior
19
- def initialize opts = {}
20
- opts = opts.merge(
21
- config_collection: opts[:collection]
22
- )
23
- initialize_hooks opts
24
- end
25
- end
26
- end
27
- end
@@ -1,8 +0,0 @@
1
- require 'ae_easy/core/exception/outdated_error'
2
-
3
- module AeEasy
4
- module Core
5
- module Exception
6
- end
7
- end
8
- end
@@ -1,9 +0,0 @@
1
- module AeEasy
2
- module Core
3
- module Exception
4
- # Exception that indicates that something is outdated.
5
- class OutdatedError < StandardError
6
- end
7
- end
8
- end
9
- end
@@ -1,8 +0,0 @@
1
- require 'ae_easy/core/helper/cookie'
2
-
3
- module AeEasy
4
- module Core
5
- module Helper
6
- end
7
- end
8
- end
@@ -1,209 +0,0 @@
1
require 'time' # provides Time.parse, used to evaluate the `expires` attribute

module AeEasy
  module Core
    module Helper
      # Helper used for lower level cookie management.
      #
      # Cookie names and values are kept as the raw strings found in the
      # input; no type conversion or URL decoding is applied.
      class Cookie
        class << self
          # Parse request cookies on different formats.
          #
          # @param [String,Hash,Array,nil] cookies Cookies to parse. A Hash is
          #   merged as is, a String is split on `'; '` and an Array is read
          #   as a list of `name=value` strings. `nil` adds nothing.
          # @param [Hash] cookie_hash ({}) External hash to store parsed cookies.
          #
          # @return [Hash] `cookie_hash`, updated in place.
          #
          # @example Parse from string.
          #   parse_from_request 'aaa=111; bbb=222'
          #   # => {'aaa' => '111', 'bbb' => '222'}
          #
          # @example Parse from array.
          #   cookies = [
          #     'aaa=111',
          #     'bbb=222'
          #   ]
          #   parse_from_request cookies
          #   # => {'aaa' => '111', 'bbb' => '222'}
          #
          # @example Parse with `cookie_hash`.
          #   cookie_hash = {'ccc' => '333'}
          #   parse_from_request 'aaa=111; bbb=222', cookie_hash
          #   cookie_hash
          #   # => {'ccc' => '333', 'aaa' => '111', 'bbb' => '222'}
          def parse_from_request cookies, cookie_hash = {}
            # Retrieve from hash
            if cookies.is_a? Hash
              cookie_hash.merge! cookies
              return cookie_hash
            end

            # Extract from string
            cookies = cookies.split '; ' if cookies.is_a? String

            # Extract from array
            cookies&.each do |raw_cookie|
              # Split on the first `=` only, values may contain `=` themselves
              key, value = raw_cookie.split('=', 2)
              cookie_hash[key] = value
            end
            cookie_hash
          end

          # Parse response cookies on different formats.
          #
          # Each Array entry is read as one `Set-Cookie` style value: a leading
          # `name=value` pair followed by optional `; attribute=value` pairs.
          # When the `expires` attribute is in the past, the cookie is removed
          # from `cookie_hash` instead of being stored. Entries without a
          # leading `name=value` pair are ignored.
          #
          # @param [String,Hash,Array,nil] cookies Cookies to parse. A Hash is
          #   merged as is. A String is split on `'; '` and every piece is
          #   treated as a separate cookie, so a cookie carrying attributes
          #   should be passed inside an Array.
          # @param [Hash] cookie_hash ({}) External hash to store parsed cookies.
          #
          # @return [Hash] `cookie_hash`, updated in place.
          #
          # @raise [ArgumentError] When an `expires` attribute can't be parsed
          #   by `Time.parse`.
          #
          # @example Parse from string
          #   parse_from_response 'aaa=111; bbb=222'
          #   # => {'aaa' => '111', 'bbb' => '222'}
          #
          # @example Parse from array.
          #   cookies = [
          #     'aaa=111; Expires=Thu, 01 Jan 1970 00:00:00 GMT; path=/',
          #     'bbb=222; path=/',
          #     'ccc=333; path=/; expires=Wed, 01 Jan 3000 00:00:00 GMT'
          #   ]
          #   parse_from_response cookies
          #   # => {'bbb' => '222', 'ccc' => '333'}
          #
          # @example Parse with `cookie_hash`.
          #   cookie_hash = {'ccc' => '333'}
          #   parse_from_response 'aaa=111; bbb=222', cookie_hash
          #   cookie_hash
          #   # => {'ccc' => '333', 'aaa' => '111', 'bbb' => '222'}
          def parse_from_response cookies, cookie_hash = {}
            # Retrieve from hash
            if cookies.is_a? Hash
              cookie_hash.merge! cookies
              return cookie_hash
            end
            # Retrieve from String
            cookies = cookies.split '; ' if cookies.is_a? String

            # Extract from array
            cookies&.each do |raw_cookie|
              # Leading `name=value` pair; entries without one (e.g. a bare
              # flag) carry no cookie and would otherwise be saved as `nil`
              key, value = raw_cookie.match(/^\s*(?<key>[^\=]+)\=(?<value>[^;]*)/i)&.captures
              next if key.nil?

              # Attributes indexed by lowercased name. The name can't span a
              # `;`, so flags like `Secure` don't swallow the next attribute.
              attributes = {}
              raw_cookie.scan(/;\s*([^=;]+)=([^;]*)/) do |name, attribute_value|
                attributes[name.strip.downcase] = attribute_value
              end

              # Check cookie expire
              expires = attributes['expires']
              if !expires.nil? && Time.now > Time.parse(expires)
                cookie_hash.delete key
                next
              end

              # Save cookie
              cookie_hash[key] = value
            end
            cookie_hash
          end

          # Apply request and response cookies as a hash.
          #
          # Request cookies are parsed first, then response cookies are
          # applied on top of them (overriding or expiring entries).
          #
          # @param [String,Array,Hash] request_cookies Cookies to parse.
          # @param [String,Array,Hash] response_cookies Cookies to parse.
          #
          # @return [Hash]
          #
          # @example
          #   request_cookies = 'aaa=111; ddd=444'
          #   response_cookies = [
          #     'aaa=111; Expires=Thu, 01 Jan 1970 00:00:00 GMT; path=/',
          #     'bbb=222; path=/',
          #     'ccc=333; path=/; expires=Wed, 01 Jan 3000 00:00:00 GMT'
          #   ]
          #   update_as_hash request_cookies, response_cookies
          #   # => {'ddd' => '444', 'bbb' => '222', 'ccc' => '333'}
          def update_as_hash request_cookies, response_cookies
            cookie_hash = {}
            parse_from_request request_cookies, cookie_hash
            parse_from_response response_cookies, cookie_hash
            cookie_hash
          end

          # Encode cookies as request cookie string.
          #
          # @param [Hash] cookie_hash Hash with cookies.
          #
          # @return [String] `name=value` pairs joined by `'; '`.
          #
          # @example
          #   cookie_hash = {
          #     'aaa' => 111,
          #     'bbb' => 222
          #   }
          #   encode_to_header cookie_hash
          #   # => 'aaa=111; bbb=222'
          def encode_to_header cookie_hash
            cookie_hash.map{|k,v| "#{k}=#{v}"}.join '; '
          end

          # Apply request and response cookies as a string with request format.
          #
          # @param [String,Array,Hash] request_cookies Cookies to parse.
          # @param [String,Array,Hash] response_cookies Cookies to parse.
          #
          # @return [String]
          #
          # @example
          #   request_cookies = 'aaa=111; ddd=444'
          #   response_cookies = [
          #     'aaa=111; Expires=Thu, 01 Jan 1970 00:00:00 GMT; path=/',
          #     'bbb=222; path=/',
          #     'ccc=333; path=/; expires=Wed, 01 Jan 3000 00:00:00 GMT'
          #   ]
          #   update request_cookies, response_cookies
          #   # => 'ddd=444; bbb=222; ccc=333'
          def update request_cookies, response_cookies
            cookie_hash = update_as_hash request_cookies, response_cookies
            encode_to_header cookie_hash
          end

          # Compare if cookie is included into base cookie
          #
          # @param [Hash] base_cookie_hash Hash that represent universe.
          # @param [Hash] cookie_hash Hash that represents to compare.
          #
          # @return [Boolean] `true` when every key of `cookie_hash` exists on
          #   `base_cookie_hash` with an equal value (an empty `cookie_hash`
          #   always matches).
          #
          # @example Check a success match.
          #   base_cookie_hash = {
          #     'aaa' => 111,
          #     'bbb' => 222,
          #     'ccc' => 333,
          #     'ddd' => 444
          #   }
          #   cookie_hash = {
          #     'bbb' => 222,
          #     'ddd' => 444
          #   }
          #   include? base_cookie_hash, cookie_hash
          #   # => true
          #
          # @example Check with fail match.
          #   base_cookie_hash = {
          #     'aaa' => 111,
          #     'bbb' => 222,
          #     'ccc' => 333,
          #     'ddd' => 444
          #   }
          #   cookie_hash = {
          #     'bbb' => 555,
          #     'ddd' => 444
          #   }
          #   include? base_cookie_hash, cookie_hash
          #   # => false
          def include? base_cookie_hash, cookie_hash
            cookie_hash.all? do |key, value|
              base_cookie_hash.has_key?(key) && base_cookie_hash[key] == value
            end
          end
        end
      end
    end
  end
end
@@ -1,45 +0,0 @@
1
- require 'ae_easy/core/mock/fake_db'
2
- require 'ae_easy/core/mock/fake_executor'
3
- require 'ae_easy/core/mock/fake_parser'
4
- require 'ae_easy/core/mock/fake_seeder'
5
- require 'ae_easy/core/mock/fake_finisher'
6
-
7
module AeEasy
  module Core
    module Mock
      # Generate a context and message queue from a list of exposed methods.
      #
      # Every exposed method is defined on the returned context object only
      # (as a singleton method) and accepts any number of arguments.
      #
      # @param [Array] exposed_methods List of exposed methods.
      #
      # @example
      #   exposed_methods = [:boo, :bar]
      #   context, message_queue = AeEasy::Core::Mock.context_vars exposed_methods
      #   context.boo 1, 2
      #   context.bar 'A', 'B'
      #   context.bar '111', '222'
      #   message_queue
      #   # => [
      #   #   [:boo, [1, 2]],
      #   #   [:bar, ['A', 'B']],
      #   #   [:bar, ['111', '222']]
      #   # ]
      #
      # @return [Array] `[context, message_queue]` being:
      #   * `context`: Object implementing exposed methods.
      #   * `[Array] message_queue`: Array to store messages.
      def self.context_vars exposed_methods
        context = Object.new
        # Local array captured by every generated method below, so all calls
        # are recorded into the same queue that is handed back to the caller
        message_queue = []
        exposed_methods.each do |key|
          context.define_singleton_method(key) do |*args|
            # Record all method calls into message queue for easy access
            message_queue << [key, args]
          end
        end
        [context, message_queue]
      end
    end
  end
end
@@ -1,561 +0,0 @@
1
- module AeEasy
2
- module Core
3
- module Mock
4
- # Fake in memory database that emulates `Answersengine` database objects' black box behavior.
5
- class FakeDb
6
- # Page id keys, analog to primary keys.
7
- PAGE_KEYS = ['gid'].freeze
8
- # Output id keys, analog to primary keys.
9
- OUTPUT_KEYS = ['_id', '_collection'].freeze
10
- # Job id keys, analog to primary keys.
11
- JOB_KEYS = ['job_id'].freeze
12
- # Job available status.
13
- JOB_STATUSES = {
14
- active: 'active',
15
- done: 'done',
16
- cancelled: 'cancelled',
17
- paused: 'paused'
18
- }
19
- # Default collection for saved outputs
20
- DEFAULT_COLLECTION = 'default'
21
-
22
- # Generate a smart collection with keys and initial values.
23
- #
24
- # @param [Array] keys Analog to primary keys, combination will be unique.
25
- # @param [Hash] opts Configuration options (see AeEasy::Core::SmartCollection#initialize).
26
- #
27
- # @return [AeEasy::Core::SmartCollection]
28
- def self.new_collection keys, opts = {}
29
- AeEasy::Core::SmartCollection.new keys, opts
30
- end
31
-
32
- # Generate a fake UUID.
33
- #
34
- # @param seed (nil) Object to use as seed for uuid.
35
- #
36
- # @return [String]
37
- def self.fake_uuid seed = nil
38
- seed ||= (Time.new.to_f + rand)
39
- Digest::SHA1.hexdigest seed.to_s
40
- end
41
-
42
- # Generate a fake UUID based on output fields without `_` prefix.
43
- #
44
- # @param [Hash] data Output data.
45
- #
46
- # @return [String]
47
- def self.output_uuid data
48
- seed = data.select{|k,v|k.to_s =~ /^[^_]/}.hash
49
- fake_uuid seed
50
- end
51
-
52
- # Build a page with defaults by using FakeDb engine.
53
- #
54
- # @param [Hash] page Page initial values.
55
- # @param [Hash] opts ({}) Configuration options (see #initialize).
56
- #
57
- # @return [Hash]
58
- def self.build_page page, opts = {}
59
- opts = {
60
- allow_page_gid_override: true,
61
- allow_job_id_override: true
62
- }.merge opts
63
- temp_db = AeEasy::Core::Mock::FakeDb.new opts
64
- temp_db.pages << page
65
- temp_db.pages.first
66
- end
67
-
68
- # Build a fake page by using FakeDb engine.
69
- #
70
- # @param [Hash] opts ({}) Configuration options (see #initialize).
71
- # @option opts [String] :url ('https://example.com') Page url.
72
- #
73
- # @return [Hash]
74
- def self.build_fake_page opts = {}
75
- page = {
76
- 'url' => (opts[:url] || 'https://example.com')
77
- }
78
- build_page page, opts
79
- end
80
-
81
- # Clean an URL to remove fragment, lowercase schema and host, and sort
82
- # query string.
83
- #
84
- # @param [String] raw_url URL to clean.
85
- #
86
- # @return [String]
87
- def self.clean_uri raw_url
88
- url = URI.parse(raw_url)
89
- url.hostname = url.hostname.downcase
90
- url.fragment = nil
91
-
92
- # Sort query string keys
93
- unless url.query.nil?
94
- query_string = CGI.parse(url.query)
95
- keys = query_string.keys.sort
96
- data = []
97
- keys.each do |key|
98
- query_string[key].each do |value|
99
- data << "#{URI.encode key}=#{URI.encode value}"
100
- end
101
- end
102
- url.query = data.join('&')
103
- end
104
- url.to_s
105
- end
106
-
107
- # Format headers for gid generation.
108
- # @private
109
- #
110
- # @param [Hash,nil] headers Headers hash.
111
- #
112
- # @return [Hash]
113
- def self.format_headers headers
114
- return {} if headers.nil?
115
- data = {}
116
- headers.each do |key, value|
117
- unless value.is_a? Array
118
- data[key] = value
119
- next
120
- end
121
- data[key] = value.sort
122
- end
123
- data
124
- end
125
-
126
- # Build a job with defaults by using FakeDb engine.
127
- #
128
- # @param [Hash] job Job initial values.
129
- # @param [Hash] opts ({}) Configuration options (see #initialize).
130
- #
131
- # @return [Hash]
132
- def self.build_job job, opts = {}
133
- temp_db = AeEasy::Core::Mock::FakeDb.new opts
134
- temp_db.jobs << job
135
- temp_db.jobs.last
136
- end
137
-
138
- # Build a fake job by using FakeDb engine.
139
- #
140
- # @param [Hash] opts ({}) Configuration options (see #initialize).
141
- # @option opts [String] :scraper_name (nil) Scraper name.
142
- # @option opts [Integer] :job_id (nil) Job id.
143
- # @option opts [String] :status ('done').
144
- #
145
- # @return [Hash]
146
- def self.build_fake_job opts = {}
147
- job = {
148
- 'job_id' => opts[:job_id],
149
- 'scraper_name' => opts[:scraper_name],
150
- 'status' => (opts[:status] || 'done')
151
- }
152
- build_job job, opts
153
- end
154
-
155
- # Return a timestamp
156
- #
157
- # @param [Time] time (nil) Time from which to get time stamp.
158
- #
159
- # @return [String]
160
- def self.time_stamp time = nil
161
- time = Time.new if time.nil?
162
- time.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
163
- end
164
-
165
- # Get current job or create new one from values.
166
- #
167
- # @param [Integer] target_job_id (nil) Job id to ensure existence.
168
- #
169
- # @return [Hash]
170
- def ensure_job target_job_id = nil
171
- target_job_id = job_id if target_job_id.nil?
172
- job = jobs.find{|v|v['job_id'] == target_job_id}
173
- return job unless job.nil?
174
- job = {
175
- 'job_id' => target_job_id,
176
- 'scraper_name' => scraper_name,
177
- }
178
- job['status'] = 'active' unless target_job_id != job_id
179
- jobs << job
180
- jobs.last
181
- end
182
-
183
- # Fake scraper_name.
184
- # @return [String,nil]
185
- def scraper_name
186
- @scraper_name ||= 'my_scraper'
187
- end
188
-
189
- # Set fake scraper_name value.
190
- def scraper_name= value
191
- job = ensure_job
192
- @scraper_name = value
193
- job['scraper_name'] = scraper_name
194
- end
195
-
196
- # Fake job id.
197
- # @return [Integer,nil]
198
- def job_id
199
- @job_id ||= generate_job_id
200
- end
201
-
202
- # Set fake job id value.
203
- def job_id= value
204
- @job_id = value
205
- ensure_job
206
- job_id
207
- end
208
-
209
- # Current fake page gid.
210
- # @return [String]
211
- def page_gid
212
- @page_gid ||= self.class.fake_uuid
213
- end
214
-
215
- # Set current fake page gid value.
216
- def page_gid= value
217
- @page_gid = value
218
- end
219
-
220
- # Enable page gid override on page or output insert.
221
- def enable_page_gid_override
222
- @allow_page_gid_override = true
223
- end
224
-
225
- # Disable page gid override on page or output insert.
226
- def disable_page_gid_override
227
- @allow_page_gid_override = false
228
- end
229
-
230
- # Specify whether page gid overriding by user is allowed on page or
231
- # output insert.
232
- #
233
- # @return [Boolean] `true` when allowed, else `false`.
234
- def allow_page_gid_override?
235
- @allow_page_gid_override ||= false
236
- end
237
-
238
- # Enable job id override on page or output insert.
239
- def enable_job_id_override
240
- @allow_job_id_override = true
241
- end
242
-
243
- # Disable job id override on page or output insert.
244
- def disable_job_id_override
245
- @allow_job_id_override = false
246
- end
247
-
248
- # Specify whether job id overriding by user is allowed on page or
249
- # output insert.
250
- #
251
- # @return [Boolean] `true` when allowed, else `false`.
252
- def allow_job_id_override?
253
- @allow_job_id_override ||= false
254
- end
255
-
256
- # Initialize fake database.
257
- #
258
- # @param [Hash] opts ({}) Configuration options.
259
- # @option opts [Integer,nil] :job_id Job id default value.
260
- # @option opts [String,nil] :scraper_name Scraper name default value.
261
- # @option opts [String,nil] :page_gid Page gid default value.
262
- # @option opts [Boolean, nil] :allow_page_gid_override (false) Specify
263
- # whether page gid can be overridden on page or output insert.
264
- # @option opts [Boolean, nil] :allow_job_id_override (false) Specify
265
- # whether job id can be overridden on page or output insert.
266
- def initialize opts = {}
267
- self.job_id = opts[:job_id]
268
- self.scraper_name = opts[:scraper_name]
269
- self.page_gid = opts[:page_gid]
270
- @allow_page_gid_override = opts[:allow_page_gid_override].nil? ? false : !!opts[:allow_page_gid_override]
271
- @allow_job_id_override = opts[:allow_job_id_override].nil? ? false : !!opts[:allow_job_id_override]
272
- end
273
-
274
- # Generate a fake scraper name.
275
- #
276
- # @return [String]
277
- def generate_scraper_name
278
- Faker::Internet.unique.slug
279
- end
280
-
281
- # Generate a fake job_id.
282
- #
283
- # @return [Integer]
284
- def generate_job_id
285
- jobs.count < 1 ? 1 : (jobs.max{|a,b|a['job_id'] <=> b['job_id']}['job_id'] + 1)
286
- end
287
-
288
- # Get job keys with key generators to emulate saving on db.
289
- # @private
290
- #
291
- # @return [Hash]
292
- def job_defaults
293
- @job_defaults ||= {
294
- 'job_id' => lambda{|job| generate_job_id},
295
- 'scraper_name' => lambda{|job| generate_scraper_name},
296
- 'status' => 'done',
297
- 'created_at' => lambda{|job| Time.now}
298
- }
299
- end
300
-
301
- # Stored job collection
302
- #
303
- # @return [AeEasy::Core::SmartCollection]
304
- def jobs
305
- return @jobs unless @jobs.nil?
306
- collection = self.class.new_collection JOB_KEYS,
307
- defaults: job_defaults
308
- collection.bind_event(:before_defaults) do |collection, raw_item|
309
- AeEasy::Core.deep_stringify_keys raw_item
310
- end
311
- collection.bind_event(:before_insert) do |collection, item, match|
312
- item['job_id'] ||= generate_job_id
313
- item
314
- end
315
- @jobs ||= collection
316
- end
317
-
318
- # Generate a fake UUID based on page data:
319
- # * url
320
- # * method
321
- # * headers
322
- # * fetch_type
323
- # * cookie
324
- # * no_redirect
325
- # * body
326
- # * ua_type
327
- #
328
- # @param [Hash] page_data Page data.
329
- #
330
- # @return [String]
331
- def generate_page_gid page_data
332
- fields = [
333
- 'url',
334
- 'method',
335
- 'headers',
336
- 'fetch_type',
337
- 'cookie',
338
- 'no_redirect',
339
- 'body',
340
- 'ua_type'
341
- ]
342
- data = page_data.select{|k,v|fields.include? k}
343
- data['url'] = self.class.clean_uri data['url']
344
- data['headers'] = self.class.format_headers data['headers']
345
- data['cookie'] = AeEasy::Core::Helper::Cookie.parse_from_request data['cookie'] unless data['cookie'].nil?
346
- seed = data.select{|k,v|fields.include? k}.hash
347
- checksum = self.class.fake_uuid seed
348
- "#{URI.parse(data['url']).hostname}-#{checksum}"
349
- end
350
-
351
- # Get page keys with key generators to emulate saving on db.
352
- # @private
353
- #
354
- # @return [Hash]
355
- def page_defaults
356
- @page_defaults ||= {
357
- 'url' => nil,
358
- 'status' => 'to_fetch',
359
- 'job_id' => lambda{|page| job_id},
360
- 'method' => 'GET',
361
- 'headers' => {},
362
- 'fetch_type' => 'standard',
363
- 'cookie' => nil,
364
- 'no_redirect' => false,
365
- 'body' => nil,
366
- 'ua_type' => 'desktop',
367
- 'no_url_encode' => false,
368
- 'http2' => false,
369
- 'vars' => {}
370
- }
371
- end
372
-
373
# Stored page collection.
#
# The collection is built on first access and memoized in `@pages`, so the
# event handlers below are bound only once.
#
# @return [AeEasy::Core::SmartCollection]
#
# @note Page gid will be replaced on insert by an auto generated uuid
#   unless page gid overriding is enabled
#   (see #allow_page_gid_override?)
def pages
  # Guard on `@pages` (the memoized collection), not on an unrelated ivar
  return @pages unless @pages.nil?

  collection = self.class.new_collection PAGE_KEYS,
    defaults: page_defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = AeEasy::Core.deep_stringify_keys raw_item
    # Drop user provided job id so the default one applies
    item.delete 'job_id' unless allow_job_id_override?
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    if item['gid'].nil? || !allow_page_gid_override?
      item['gid'] = generate_page_gid item
    end
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    # Make sure the job referenced by the page exists
    ensure_job item['job_id']
  end
  @pages = collection
end
401
-
402
- # Generate a fake UUID for outputs.
403
- #
404
- # @param [Hash] data Output data.
405
- #
406
- # @return [String]
407
- def generate_output_id data
408
- # Generate random UUID to match AnswersEngine behavior
409
- self.class.fake_uuid
410
- end
411
-
412
- # Get output keys with key generators to emulate saving on db.
413
- # @private
414
- #
415
- # @return [Hash]
416
- def output_defaults
417
- @output_defaults ||= {
418
- '_collection' => DEFAULT_COLLECTION,
419
- '_job_id' => lambda{|output| job_id},
420
- '_created_at' => lambda{|output| self.class.time_stamp},
421
- '_gid' => lambda{|output| page_gid}
422
- }
423
- end
424
-
425
# Stored output collection
#
# The collection is built on first access and memoized in `@outputs`.
#
# @return [AeEasy::Core::SmartCollection]
#
# @note `_job_id` and `_gid` provided by the user are discarded before
#   defaults are applied unless the matching override is enabled
#   (see #allow_job_id_override? and #allow_page_gid_override?).
def outputs
  return @outputs unless @outputs.nil?

  collection = self.class.new_collection OUTPUT_KEYS,
    defaults: output_defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = AeEasy::Core.deep_stringify_keys raw_item
    item.delete '_job_id' unless allow_job_id_override?
    # Output page gid lives under `_gid` (see #output_defaults)
    item.delete '_gid' unless allow_page_gid_override?
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    item['_id'] ||= generate_output_id item
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    # Make sure the job referenced by the output exists
    ensure_job item['_job_id']
  end
  @outputs = collection
end
447
-
448
- # Match data to filters.
449
- # @private
450
- #
451
- # @param data Hash containing data.
452
- # @param filters Filters to apply on match.
453
- #
454
- # @return [Boolean]
455
- #
456
- # @note Missing and `nil` values on `data` will match when `filters`'
457
- # field is `nil`.
458
- def match? data, filters
459
- filters.each do |key, value|
460
- return false if data[key] != value
461
- end
462
- true
463
- end
464
-
465
- # Search items from a collection.
466
- #
467
- # @param [Symbol] collection Allowed values: `:outputs`, `:pages`.
468
- # @param [Hash] filter Filters to query.
469
- # @param [Integer] offset (0) Search results offset.
470
- # @param [Integer,nil] limit (nil) Limit search results count. Set to `nil` for unlimited.
471
- #
472
- # @raise ArgumentError On unknown collection.
473
- #
474
- # @note _Warning:_ It uses table scan to filter and should be used on test suites only.
475
- def query collection, filter, offset = 0, limit = nil
476
- return [] unless limit.nil? || limit > 0
477
-
478
- # Get collection items
479
- items = case collection
480
- when :outputs
481
- outputs
482
- when :pages
483
- pages
484
- when :jobs
485
- jobs
486
- else
487
- raise ArgumentError.new "Unknown collection #{collection}."
488
- end
489
-
490
- # Search items
491
- count = 0
492
- matches = []
493
- items.each do |item|
494
- next unless match? item, filter
495
- count += 1
496
-
497
- # Skip until offset
498
- next unless offset < count
499
- # Break on limit reach
500
- break unless limit.nil? || matches.count < limit
501
- matches << item
502
- end
503
- matches
504
- end
505
-
506
# Refetch a page.
#
# Looks the page up by gid and job id, flags it as `to_fetch` and resets
# its fetch, parse and response related fields.
#
# @param [Integer] job_id Page's job_id to refetch.
# @param [String] gid Page's gid to refetch.
#
# @raise [::Exception] When no page matches `job_id` and `gid`.
def refetch job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  # `::Exception` is explicit on purpose: a bare `Exception` written inside
  # `AeEasy::Core` resolves to the `AeEasy::Core::Exception` module, which
  # can't be instantiated.
  raise ::Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"") if page.nil?
  page['status'] = 'to_fetch'
  page['freshness'] = self.class.time_stamp
  page['to_fetch'] = self.class.time_stamp
  page['fetched_from'] = nil
  page['fetching_at'] = '2001-01-01T00:00:00Z'
  page['fetched_at'] = nil
  page['fetching_try_count'] = 0
  page['effective_url'] = nil
  page['parsing_at'] = nil
  page['parsing_failed_at'] = nil
  page['parsed_at'] = nil
  page['parsing_try_count'] = 0
  page['parsing_fail_count'] = 0
  page['parsing_updated_at'] = '2001-01-01T00:00:00Z'
  page['response_checksum'] = nil
  page['response_status'] = nil
  page['response_status_code'] = nil
  page['response_headers'] = nil
  page['response_cookie'] = nil
  page['response_proto'] = nil
  page['content_type'] = nil
  page['content_size'] = 0
  page['failed_response_status_code'] = nil
  page['failed_response_headers'] = nil
  page['failed_response_cookie'] = nil
  page['failed_effective_url'] = nil
  page['failed_at'] = nil
  page['failed_content_type'] = nil
end
542
-
543
# Reparse a page.
#
# Looks the page up by gid and job id, flags it as `to_parse` and resets
# its parse related fields.
#
# @param [Integer] job_id Page's job_id to reparse.
# @param [String] gid Page's gid to reparse.
#
# @raise [::Exception] When no page matches `job_id` and `gid`.
def reparse job_id, gid
  page = pages.find_match('gid' => gid, 'job_id' => job_id)
  # `::Exception` is explicit on purpose: a bare `Exception` written inside
  # `AeEasy::Core` resolves to the `AeEasy::Core::Exception` module, which
  # can't be instantiated.
  raise ::Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"") if page.nil?
  page['status'] = 'to_parse'
  page['parsing_at'] = nil
  page['parsing_failed_at'] = nil
  page['parsing_updated_at'] = '2001-01-01T00:00:00Z'
  page['parsed_at'] = nil
  page['parsing_try_count'] = 0
  page['parsing_fail_count'] = 0
end
558
- end
559
- end
560
- end
561
- end