ae_easy-core 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CODE_OF_CONDUCT.md +1 -1
- data/Gemfile +1 -1
- data/LICENSE +1 -1
- data/README.md +8 -4
- data/Rakefile +0 -10
- data/ae_easy-core.gemspec +6 -13
- data/lib/ae_easy/core.rb +4 -256
- metadata +18 -125
- data/doc/AeEasy.html +0 -117
- data/doc/AeEasy/Core.html +0 -1590
- data/doc/AeEasy/Core/Config.html +0 -311
- data/doc/AeEasy/Core/Exception.html +0 -117
- data/doc/AeEasy/Core/Exception/OutdatedError.html +0 -135
- data/doc/AeEasy/Core/Helper.html +0 -117
- data/doc/AeEasy/Core/Helper/Cookie.html +0 -1070
- data/doc/AeEasy/Core/Mock.html +0 -282
- data/doc/AeEasy/Core/Mock/FakeDb.html +0 -3779
- data/doc/AeEasy/Core/Mock/FakeExecutor.html +0 -3289
- data/doc/AeEasy/Core/Mock/FakeFinisher.html +0 -160
- data/doc/AeEasy/Core/Mock/FakeParser.html +0 -160
- data/doc/AeEasy/Core/Mock/FakeSeeder.html +0 -160
- data/doc/AeEasy/Core/Plugin.html +0 -117
- data/doc/AeEasy/Core/Plugin/CollectionVault.html +0 -299
- data/doc/AeEasy/Core/Plugin/ConfigBehavior.html +0 -541
- data/doc/AeEasy/Core/Plugin/ContextIntegrator.html +0 -445
- data/doc/AeEasy/Core/Plugin/Executor.html +0 -259
- data/doc/AeEasy/Core/Plugin/ExecutorBehavior.html +0 -344
- data/doc/AeEasy/Core/Plugin/Finisher.html +0 -265
- data/doc/AeEasy/Core/Plugin/FinisherBehavior.html +0 -142
- data/doc/AeEasy/Core/Plugin/InitializeHook.html +0 -220
- data/doc/AeEasy/Core/Plugin/Parser.html +0 -270
- data/doc/AeEasy/Core/Plugin/ParserBehavior.html +0 -235
- data/doc/AeEasy/Core/Plugin/Seeder.html +0 -674
- data/doc/AeEasy/Core/Plugin/SeederBehavior.html +0 -142
- data/doc/AeEasy/Core/SmartCollection.html +0 -1087
- data/doc/_index.html +0 -364
- data/doc/class_list.html +0 -51
- data/doc/css/common.css +0 -1
- data/doc/css/full_list.css +0 -58
- data/doc/css/style.css +0 -496
- data/doc/file.README.html +0 -91
- data/doc/file_list.html +0 -56
- data/doc/frames.html +0 -17
- data/doc/index.html +0 -91
- data/doc/js/app.js +0 -303
- data/doc/js/full_list.js +0 -216
- data/doc/js/jquery.js +0 -4
- data/doc/method_list.html +0 -939
- data/doc/top-level-namespace.html +0 -110
- data/lib/ae_easy/core/config.rb +0 -27
- data/lib/ae_easy/core/exception.rb +0 -8
- data/lib/ae_easy/core/exception/outdated_error.rb +0 -9
- data/lib/ae_easy/core/helper.rb +0 -8
- data/lib/ae_easy/core/helper/cookie.rb +0 -209
- data/lib/ae_easy/core/mock.rb +0 -45
- data/lib/ae_easy/core/mock/fake_db.rb +0 -561
- data/lib/ae_easy/core/mock/fake_executor.rb +0 -373
- data/lib/ae_easy/core/mock/fake_finisher.rb +0 -28
- data/lib/ae_easy/core/mock/fake_parser.rb +0 -33
- data/lib/ae_easy/core/mock/fake_seeder.rb +0 -28
- data/lib/ae_easy/core/plugin.rb +0 -19
- data/lib/ae_easy/core/plugin/collection_vault.rb +0 -23
- data/lib/ae_easy/core/plugin/config_behavior.rb +0 -43
- data/lib/ae_easy/core/plugin/context_integrator.rb +0 -60
- data/lib/ae_easy/core/plugin/executor.rb +0 -19
- data/lib/ae_easy/core/plugin/executor_behavior.rb +0 -32
- data/lib/ae_easy/core/plugin/finisher.rb +0 -19
- data/lib/ae_easy/core/plugin/finisher_behavior.rb +0 -9
- data/lib/ae_easy/core/plugin/initialize_hook.rb +0 -17
- data/lib/ae_easy/core/plugin/parser.rb +0 -19
- data/lib/ae_easy/core/plugin/parser_behavior.rb +0 -17
- data/lib/ae_easy/core/plugin/seeder.rb +0 -44
- data/lib/ae_easy/core/plugin/seeder_behavior.rb +0 -9
- data/lib/ae_easy/core/smart_collection.rb +0 -236
- data/lib/ae_easy/core/version.rb +0 -6
@@ -1,110 +0,0 @@
|
|
1
|
-
<!DOCTYPE html>
|
2
|
-
<html>
|
3
|
-
<head>
|
4
|
-
<meta charset="utf-8">
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6
|
-
<title>
|
7
|
-
Top Level Namespace
|
8
|
-
|
9
|
-
— Documentation by YARD 0.9.20
|
10
|
-
|
11
|
-
</title>
|
12
|
-
|
13
|
-
<link rel="stylesheet" href="css/style.css" type="text/css" charset="utf-8" />
|
14
|
-
|
15
|
-
<link rel="stylesheet" href="css/common.css" type="text/css" charset="utf-8" />
|
16
|
-
|
17
|
-
<script type="text/javascript" charset="utf-8">
|
18
|
-
pathId = "";
|
19
|
-
relpath = '';
|
20
|
-
</script>
|
21
|
-
|
22
|
-
|
23
|
-
<script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
|
24
|
-
|
25
|
-
<script type="text/javascript" charset="utf-8" src="js/app.js"></script>
|
26
|
-
|
27
|
-
|
28
|
-
</head>
|
29
|
-
<body>
|
30
|
-
<div class="nav_wrap">
|
31
|
-
<iframe id="nav" src="class_list.html?1"></iframe>
|
32
|
-
<div id="resizer"></div>
|
33
|
-
</div>
|
34
|
-
|
35
|
-
<div id="main" tabindex="-1">
|
36
|
-
<div id="header">
|
37
|
-
<div id="menu">
|
38
|
-
|
39
|
-
<a href="_index.html">Index</a> »
|
40
|
-
|
41
|
-
|
42
|
-
<span class="title">Top Level Namespace</span>
|
43
|
-
|
44
|
-
</div>
|
45
|
-
|
46
|
-
<div id="search">
|
47
|
-
|
48
|
-
<a class="full_list_link" id="class_list_link"
|
49
|
-
href="class_list.html">
|
50
|
-
|
51
|
-
<svg width="24" height="24">
|
52
|
-
<rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
|
53
|
-
<rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
|
54
|
-
<rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
|
55
|
-
</svg>
|
56
|
-
</a>
|
57
|
-
|
58
|
-
</div>
|
59
|
-
<div class="clear"></div>
|
60
|
-
</div>
|
61
|
-
|
62
|
-
<div id="content"><h1>Top Level Namespace
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
</h1>
|
67
|
-
<div class="box_info">
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
</div>
|
80
|
-
|
81
|
-
<h2>Defined Under Namespace</h2>
|
82
|
-
<p class="children">
|
83
|
-
|
84
|
-
|
85
|
-
<strong class="modules">Modules:</strong> <span class='object_link'><a href="AeEasy.html" title="AeEasy (module)">AeEasy</a></span>
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
</p>
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
</div>
|
101
|
-
|
102
|
-
<div id="footer">
|
103
|
-
Generated on Fri Sep 27 02:01:30 2019 by
|
104
|
-
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
105
|
-
0.9.20 (ruby-2.5.3).
|
106
|
-
</div>
|
107
|
-
|
108
|
-
</div>
|
109
|
-
</body>
|
110
|
-
</html>
|
data/lib/ae_easy/core/config.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
module AeEasy
  module Core
    # Configuration manager tool useful for global configuration data across
    # the scraping process.
    class Config
      include AeEasy::Core::Plugin::InitializeHook
      include AeEasy::Core::Plugin::ConfigBehavior

      # Short name for
      # {AeEasy::Core::Plugin::ConfigBehavior#config_collection_key}.
      alias_method :collection_key, :config_collection_key
      # Short name for
      # {AeEasy::Core::Plugin::ConfigBehavior#config_collection}.
      alias_method :collection, :config_collection

      # Initialize config object.
      #
      # The value given as `:collection` is forwarded to the initialize hooks
      # under the `:config_collection` key; the caller's hash is not modified.
      #
      # @param [Hash] opts ({}) Configuration options.
      #
      # @see AeEasy::Core::Plugin::ConfigBehavior#initialize_hook_core_config_behavior
      def initialize(opts = {})
        hook_opts = opts.dup
        hook_opts[:config_collection] = opts[:collection]
        initialize_hooks(hook_opts)
      end
    end
  end
end
|
data/lib/ae_easy/core/helper.rb
DELETED
@@ -1,209 +0,0 @@
|
|
1
|
-
require 'time' # Time.parse is provided by the 'time' stdlib extension.

module AeEasy
  module Core
    module Helper
      # Helper used for lower level cookie management.
      class Cookie
        # Separator between cookie pairs and attributes; the whitespace after
        # `;` is optional so `a=1;b=2` and `a=1; b=2` are both accepted.
        PAIR_SEPARATOR = /;\s*/.freeze
        # Leading `name=value` pair of a raw cookie.
        NAME_VALUE_PATTERN = /\A\s*(?<key>[^=;]+)=(?<value>[^;]*)/.freeze
        # `; attribute=value` pairs. Excluding `;` from the attribute name
        # keeps value-less flags (`HttpOnly`, `Secure`) from swallowing the
        # attribute that follows them.
        ATTRIBUTE_PATTERN = /;\s*([^=;]+)=([^;]*)/.freeze

        class << self
          # Parse request cookies on different formats.
          #
          # @param [String,Hash,Array,nil] cookies Cookies to parse.
          # @param [Hash] cookie_hash ({}) External hash to store parsed
          #   cookies; it is modified in place and returned.
          #
          # @return [Hash] `cookie_hash` with cookie values stored as strings.
          #
          # @example Parse from string.
          #   parse_from_request 'aaa=111; bbb=222'
          #   # => {'aaa' => '111', 'bbb' => '222'}
          #
          # @example Parse from array.
          #   cookies = [
          #     'aaa=111',
          #     'bbb=222'
          #   ]
          #   parse_from_request cookies
          #   # => {'aaa' => '111', 'bbb' => '222'}
          #
          # @example Parse with `cookie_hash`.
          #   cookie_hash = {'ccc' => '333'}
          #   parse_from_request 'aaa=111; bbb=222', cookie_hash
          #   cookie_hash
          #   # => {'ccc' => '333', 'aaa' => '111', 'bbb' => '222'}
          def parse_from_request(cookies, cookie_hash = {})
            # Hashes are already parsed: merge them as they are.
            return cookie_hash.merge!(cookies) if cookies.is_a?(Hash)

            cookies = cookies.split(PAIR_SEPARATOR) if cookies.is_a?(String)
            cookies&.each do |raw_cookie|
              key, value = raw_cookie.to_s.split('=', 2)
              key = key.to_s.strip
              # Blank entries carry no cookie name; skip them instead of
              # storing a `nil` key.
              next if key.empty?

              cookie_hash[key] = value
            end
            cookie_hash
          end

          # Parse response cookies on different formats.
          #
          # Cookies whose `expires` attribute is in the past are removed from
          # `cookie_hash`. An `expires` value that cannot be parsed as a time
          # is ignored, and entries without a `name=value` pair are skipped.
          #
          # @param [String,Hash,Array,nil] cookies Cookies to parse.
          # @param [Hash] cookie_hash ({}) External hash to store parsed
          #   cookies; it is modified in place and returned.
          #
          # @return [Hash] `cookie_hash` with cookie values stored as strings.
          #
          # @example Parse from string
          #   parse_from_response 'aaa=111; bbb=222'
          #   # => {'aaa' => '111', 'bbb' => '222'}
          #
          # @example Parse from array.
          #   cookies = [
          #     'aaa=111; Expires=Thu, 01 Jan 1970 00:00:00 GMT; path=/',
          #     'bbb=222; path=/',
          #     'ccc=333; path=/; expires=Wed, 01 Jan 3000 00:00:00 GMT'
          #   ]
          #   parse_from_response cookies
          #   # => {'bbb' => '222', 'ccc' => '333'}
          #
          # @example Parse with `cookie_hash`.
          #   cookie_hash = {'ccc' => '333'}
          #   parse_from_response 'aaa=111; bbb=222', cookie_hash
          #   cookie_hash
          #   # => {'ccc' => '333', 'aaa' => '111', 'bbb' => '222'}
          def parse_from_response(cookies, cookie_hash = {})
            # Hashes are already parsed: merge them as they are.
            return cookie_hash.merge!(cookies) if cookies.is_a?(Hash)

            # A plain string is treated as a list of `name=value` pairs.
            cookies = cookies.split(PAIR_SEPARATOR) if cookies.is_a?(String)
            now = Time.now
            cookies&.each do |raw_cookie|
              raw_cookie = raw_cookie.to_s
              pair = NAME_VALUE_PATTERN.match(raw_cookie)
              next if pair.nil?

              key = pair[:key].strip
              next if key.empty?

              expires = expiration_of(raw_cookie)
              if expires && now > expires
                # The server expired the cookie: forget any stored value.
                cookie_hash.delete(key)
              else
                cookie_hash[key] = pair[:value]
              end
            end
            cookie_hash
          end

          # Apply request and response cookies as a hash.
          #
          # @param [String,Array,Hash] request_cookies Cookies to parse.
          # @param [String,Array,Hash] response_cookies Cookies to parse.
          #
          # @return [Hash]
          #
          # @example
          #   request_cookies = 'aaa=111; ddd=444'
          #   response_cookies = [
          #     'aaa=111; Expires=Thu, 01 Jan 1970 00:00:00 GMT; path=/',
          #     'bbb=222; path=/',
          #     'ccc=333; path=/; expires=Wed, 01 Jan 3000 00:00:00 GMT'
          #   ]
          #   update_as_hash request_cookies, response_cookies
          #   # => {'ddd' => '444', 'bbb' => '222', 'ccc' => '333'}
          def update_as_hash(request_cookies, response_cookies)
            cookie_hash = {}
            parse_from_request(request_cookies, cookie_hash)
            parse_from_response(response_cookies, cookie_hash)
            cookie_hash
          end

          # Encode cookies as request cookie string.
          #
          # @param [Hash] cookie_hash Hash with cookies.
          #
          # @return [String]
          #
          # @example
          #   cookie_hash = {
          #     'aaa' => 111,
          #     'bbb' => 222
          #   }
          #   encode_to_header cookie_hash
          #   # => 'aaa=111; bbb=222'
          def encode_to_header(cookie_hash)
            cookie_hash.map { |key, value| "#{key}=#{value}" }.join('; ')
          end

          # Apply request and response cookies as a string with request format.
          #
          # @param [String,Array,Hash] request_cookies Cookies to parse.
          # @param [String,Array,Hash] response_cookies Cookies to parse.
          #
          # @return [String]
          #
          # @example
          #   request_cookies = 'aaa=111; ddd=444'
          #   response_cookies = [
          #     'aaa=111; Expires=Thu, 01 Jan 1970 00:00:00 GMT; path=/',
          #     'bbb=222; path=/',
          #     'ccc=333; path=/; expires=Wed, 01 Jan 3000 00:00:00 GMT'
          #   ]
          #   update request_cookies, response_cookies
          #   # => 'ddd=444; bbb=222; ccc=333'
          def update(request_cookies, response_cookies)
            encode_to_header(update_as_hash(request_cookies, response_cookies))
          end

          # Compare if cookie is included into base cookie
          #
          # @param [Hash] base_cookie_hash Hash that represent universe.
          # @param [Hash] cookie_hash Hash that represents to compare.
          #
          # @return [Boolean] `true` when every pair of `cookie_hash` exists
          #   with an equal value in `base_cookie_hash`.
          #
          # @example Check a success match.
          #   base_cookie_hash = {
          #     'aaa' => 111,
          #     'bbb' => 222,
          #     'ccc' => 333,
          #     'ddd' => 444
          #   }
          #   cookie_hash = {
          #     'bbb' => 222,
          #     'ddd' => 444
          #   }
          #   include? base_cookie_hash, cookie_hash
          #   # => true
          #
          # @example Check with fail match.
          #   base_cookie_hash = {
          #     'aaa' => 111,
          #     'bbb' => 222,
          #     'ccc' => 333,
          #     'ddd' => 444
          #   }
          #   cookie_hash = {
          #     'bbb' => 555,
          #     'ddd' => 444
          #   }
          #   include? base_cookie_hash, cookie_hash
          #   # => false
          def include?(base_cookie_hash, cookie_hash)
            cookie_hash.all? do |key, value|
              base_cookie_hash.key?(key) && base_cookie_hash[key] == value
            end
          end

          private

          # Extract the expiration time from a raw response cookie.
          #
          # @param [String] raw_cookie Raw `Set-Cookie` style value.
          #
          # @return [Time,nil] Expiration time, or `nil` when the cookie has
          #   no `expires` attribute or its value cannot be parsed.
          def expiration_of(raw_cookie)
            attributes = raw_cookie.scan(ATTRIBUTE_PATTERN).each_with_object({}) do |(name, value), memo|
              # Attribute names are matched case-insensitively.
              memo[name.strip.downcase] = value
            end
            raw_expires = attributes['expires']
            return nil if raw_expires.nil?

            Time.parse(raw_expires)
          rescue ArgumentError
            # Time.parse raises ArgumentError on unparsable input; such an
            # attribute is treated as absent.
            nil
          end
        end
      end
    end
  end
end
|
data/lib/ae_easy/core/mock.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
require 'ae_easy/core/mock/fake_db'
|
2
|
-
require 'ae_easy/core/mock/fake_executor'
|
3
|
-
require 'ae_easy/core/mock/fake_parser'
|
4
|
-
require 'ae_easy/core/mock/fake_seeder'
|
5
|
-
require 'ae_easy/core/mock/fake_finisher'
|
6
|
-
|
7
|
-
module AeEasy
  module Core
    module Mock
      # Generate a context and message queue from a list of exposed methods.
      #
      # Every exposed method is defined on the returned context only (as a
      # singleton method). Calling one appends `[method_name, args]` to the
      # message queue and returns the queue.
      #
      # @param [Array] exposed_methods List of exposed methods.
      #
      # @example
      #   exposed_methods = [:boo, :bar]
      #   context, message_queue = AeEasy::Core::Mock.context_vars exposed_methods
      #   context.boo 1, 2
      #   context.bar 'A', 'B'
      #   context.bar '111', '222'
      #   message_queue
      #   # => [
      #   #   [:boo, [1, 2]],
      #   #   [:bar, ['A', 'B']],
      #   #   [:bar, ['111', '222']]
      #   # ]
      #
      # @return [Array] `[context, message_queue]` being:
      #   * `context`: Object implementing exposed methods.
      #   * `[Array] message_queue`: Array to store messages.
      def self.context_vars(exposed_methods)
        context = Object.new
        # One array shared by every generated method through its closure, so
        # the caller observes all recorded calls on the returned object.
        message_queue = []
        exposed_methods.each do |method_name|
          context.define_singleton_method(method_name) do |*args|
            # Record all method calls into message queue for easy access
            message_queue << [method_name, args]
          end
        end
        [context, message_queue]
      end
    end
  end
end
|
@@ -1,561 +0,0 @@
|
|
1
|
-
require 'digest' # Digest::SHA1 backs FakeDb.fake_uuid.
require 'uri'    # URI parsing and form encoding back FakeDb.clean_uri.

module AeEasy
  module Core
    module Mock
      # Fake in memory database that emulates `Answersengine` database objects' black box behavior.
      class FakeDb
        # Page id keys, analog to primary keys.
        PAGE_KEYS = ['gid'].freeze
        # Output id keys, analog to primary keys.
        OUTPUT_KEYS = ['_id', '_collection'].freeze
        # Job id keys, analog to primary keys.
        JOB_KEYS = ['job_id'].freeze
        # Job available status.
        JOB_STATUSES = {
          active: 'active',
          done: 'done',
          cancelled: 'cancelled',
          paused: 'paused'
        }
        # Default collection for saved outputs
        DEFAULT_COLLECTION = 'default'

        # Generate a smart collection with keys and initial values.
        #
        # @param [Array] keys Analog to primary keys, combination will be uniq.
        # @param [Hash] opts Configuration options (see AeEasy::Core::SmartCollection#initialize).
        #
        # @return [AeEasy::Core::SmartCollection]
        def self.new_collection(keys, opts = {})
          AeEasy::Core::SmartCollection.new(keys, opts)
        end

        # Generate a fake UUID (a SHA1 hex digest of the seed's string form).
        #
        # @param seed (nil) Object to use as seed for uuid. When omitted, the
        #   current time plus a random number is used.
        #
        # @return [String]
        def self.fake_uuid(seed = nil)
          seed ||= (Time.new.to_f + rand)
          Digest::SHA1.hexdigest(seed.to_s)
        end

        # Generate a fake UUID based on output fields without `_` prefix.
        #
        # @param [Hash] data Output data.
        #
        # @return [String]
        #
        # @note The seed is `Hash#hash`, so the result is stable within one
        #   Ruby process but may differ between processes.
        def self.output_uuid(data)
          public_fields = data.select { |key, _value| /^[^_]/.match?(key.to_s) }
          fake_uuid(public_fields.hash)
        end

        # Build a page with defaults by using FakeDb engine.
        #
        # Page gid and job id overriding are enabled unless `opts` says
        # otherwise.
        #
        # @param [Hash] page Page initial values.
        # @param [Hash] opts ({}) Configuration options (see #initialize).
        #
        # @return [Hash]
        def self.build_page(page, opts = {})
          opts = {
            allow_page_gid_override: true,
            allow_job_id_override: true
          }.merge(opts)
          temp_db = AeEasy::Core::Mock::FakeDb.new(opts)
          temp_db.pages << page
          temp_db.pages.first
        end

        # Build a fake page by using FakeDb engine.
        #
        # @param [Hash] opts ({}) Configuration options (see #initialize).
        # @option opts [String] :url ('https://example.com') Page url.
        #
        # @return [Hash]
        def self.build_fake_page(opts = {})
          page = {
            'url' => (opts[:url] || 'https://example.com')
          }
          build_page(page, opts)
        end

        # Clean an URL to remove fragment, lowercase host, and sort query
        # string by key (values of a repeated key keep their relative order).
        #
        # Query keys and values are re-encoded as
        # `application/x-www-form-urlencoded`, so reserved characters inside
        # a value (`&`, `=`) stay escaped.
        #
        # @param [String] raw_url URL to clean.
        #
        # @return [String]
        def self.clean_uri(raw_url)
          url = URI.parse(raw_url)
          # Relative or opaque URIs have no host to normalize.
          url.hostname = url.hostname.downcase unless url.hostname.nil?
          url.fragment = nil

          # Sort query string keys
          unless url.query.nil?
            pairs = URI.decode_www_form(url.query)
            # The index is a tie-breaker that keeps the sort stable.
            sorted = pairs.each_with_index
              .sort_by { |(key, _value), index| [key, index] }
              .map(&:first)
            url.query = URI.encode_www_form(sorted)
          end
          url.to_s
        end

        # Format headers for gid generation: array values are sorted, any
        # other value is copied as it is.
        # @private
        #
        # @param [Hash,nil] headers Headers hash.
        #
        # @return [Hash]
        def self.format_headers(headers)
          return {} if headers.nil?

          headers.each_with_object({}) do |(key, value), formatted|
            formatted[key] = value.is_a?(Array) ? value.sort : value
          end
        end

        # Build a job with defaults by using FakeDb engine.
        #
        # @param [Hash] job Job initial values.
        # @param [Hash] opts ({}) Configuration options (see #initialize).
        #
        # @return [Hash]
        def self.build_job(job, opts = {})
          temp_db = AeEasy::Core::Mock::FakeDb.new(opts)
          temp_db.jobs << job
          temp_db.jobs.last
        end

        # Build a fake job by using FakeDb engine.
        #
        # @param [Hash] opts ({}) Configuration options (see #initialize).
        # @option opts [String] :scraper_name (nil) Scraper name.
        # @option opts [Integer] :job_id (nil) Job id.
        # @option opts [String] :status ('done').
        #
        # @return [Hash]
        def self.build_fake_job(opts = {})
          job = {
            'job_id' => opts[:job_id],
            'scraper_name' => opts[:scraper_name],
            'status' => (opts[:status] || 'done')
          }
          build_job(job, opts)
        end

        # Return a UTC timestamp formatted as `YYYY-MM-DDTHH:MM:SSZ`.
        #
        # @param [Time] time (nil) Time from which to get time stamp. Uses
        #   the current time when `nil`. The given object is not modified.
        #
        # @return [String]
        def self.time_stamp(time = nil)
          time = Time.new if time.nil?
          # Time#utc would convert the caller's object in place; #getutc
          # returns a converted copy instead.
          time.getutc.strftime('%Y-%m-%dT%H:%M:%SZ')
        end

        # Get current job or create new one from values.
        #
        # A created job gets the `'active'` status only when it is the
        # database's current job.
        #
        # @param [Integer] target_job_id (nil) Job id to ensure existence.
        #   Defaults to the current job id.
        #
        # @return [Hash]
        def ensure_job(target_job_id = nil)
          target_job_id = job_id if target_job_id.nil?
          existing = jobs.find { |job| job['job_id'] == target_job_id }
          return existing unless existing.nil?

          job = {
            'job_id' => target_job_id,
            'scraper_name' => scraper_name
          }
          job['status'] = 'active' if target_job_id == job_id
          jobs << job
          jobs.last
        end

        # Fake scraper_name.
        # @return [String] Defaults to `'my_scraper'` when unset.
        def scraper_name
          @scraper_name ||= 'my_scraper'
        end

        # Set fake scraper_name value and update the current job with it.
        def scraper_name=(value)
          job = ensure_job
          @scraper_name = value
          # Read through the getter so a `nil` value falls back to the default.
          job['scraper_name'] = scraper_name
        end

        # Fake job id.
        # @return [Integer,nil]
        def job_id
          @job_id ||= generate_job_id
        end

        # Set fake job id value and make sure a job with that id exists.
        def job_id=(value)
          @job_id = value
          ensure_job
          job_id
        end

        # Current fake page gid.
        # @return [String] Defaults to a generated fake UUID when unset.
        def page_gid
          @page_gid ||= self.class.fake_uuid
        end

        # Set current fake page gid value.
        def page_gid=(value)
          @page_gid = value
        end

        # Enable page gid override on page or output insert.
        def enable_page_gid_override
          @allow_page_gid_override = true
        end

        # Disable page gid override on page or output insert.
        def disable_page_gid_override
          @allow_page_gid_override = false
        end

        # Specify whether page gid overriding by user is allowed on page or
        # output insert.
        #
        # @return [Boolean] `true` when allowed, else `false`.
        def allow_page_gid_override?
          @allow_page_gid_override ||= false
        end

        # Enable job id override on page or output insert.
        def enable_job_id_override
          @allow_job_id_override = true
        end

        # Disable job id override on page or output insert.
        def disable_job_id_override
          @allow_job_id_override = false
        end

        # Specify whether job id overriding by user is allowed on page or
        # output insert.
        #
        # @return [Boolean] `true` when allowed, else `false`.
        def allow_job_id_override?
          @allow_job_id_override ||= false
        end

        # Initialize fake database.
        #
        # @param [Hash] opts ({}) Configuration options.
        # @option opts [Integer,nil] :job_id Job id default value.
        # @option opts [String,nil] :scraper_name Scraper name default value.
        # @option opts [String,nil] :page_gid Page gid default value.
        # @option opts [Boolean, nil] :allow_page_gid_override (false) Specify
        #   whether page gid can be overridden on page or output insert.
        # @option opts [Boolean, nil] :allow_job_id_override (false) Specify
        #   whether job id can be overridden on page or output insert.
        def initialize(opts = {})
          self.job_id = opts[:job_id]
          self.scraper_name = opts[:scraper_name]
          self.page_gid = opts[:page_gid]
          # `!!` maps both `nil` and `false` to `false`.
          @allow_page_gid_override = !!opts[:allow_page_gid_override]
          @allow_job_id_override = !!opts[:allow_job_id_override]
        end

        # Generate a fake scraper name.
        #
        # @return [String]
        def generate_scraper_name
          Faker::Internet.unique.slug
        end

        # Generate a fake job_id: `1` when there are no jobs, otherwise the
        # highest stored job id plus one.
        #
        # @return [Integer]
        def generate_job_id
          return 1 if jobs.count < 1

          jobs.max_by { |job| job['job_id'] }['job_id'] + 1
        end

        # Get job keys with key generators to emulate saving on db.
        # @private
        #
        # @return [Hash]
        def job_defaults
          @job_defaults ||= {
            'job_id' => lambda { |_job| generate_job_id },
            'scraper_name' => lambda { |_job| generate_scraper_name },
            'status' => 'done',
            'created_at' => lambda { |_job| Time.now }
          }
        end

        # Stored job collection
        #
        # @return [AeEasy::Core::SmartCollection]
        def jobs
          return @jobs unless @jobs.nil?

          collection = self.class.new_collection(JOB_KEYS, defaults: job_defaults)
          collection.bind_event(:before_defaults) do |_collection, raw_item|
            AeEasy::Core.deep_stringify_keys(raw_item)
          end
          collection.bind_event(:before_insert) do |_collection, item, _match|
            item['job_id'] ||= generate_job_id
            item
          end
          @jobs ||= collection
        end

        # Generate a fake UUID based on page data:
        # * url
        # * method
        # * headers
        # * fetch_type
        # * cookie
        # * no_redirect
        # * body
        # * ua_type
        #
        # @param [Hash] page_data Page data.
        #
        # @return [String] `"<url hostname>-<checksum>"`.
        #
        # @note The checksum is seeded with `Hash#hash`, so it is stable
        #   within one Ruby process but may differ between processes.
        def generate_page_gid(page_data)
          fields = %w[url method headers fetch_type cookie no_redirect body ua_type]
          data = page_data.select { |key, _value| fields.include?(key) }
          # Normalize the values so equivalent requests share a gid.
          data['url'] = self.class.clean_uri(data['url'])
          data['headers'] = self.class.format_headers(data['headers'])
          unless data['cookie'].nil?
            data['cookie'] = AeEasy::Core::Helper::Cookie.parse_from_request(data['cookie'])
          end
          seed = data.select { |key, _value| fields.include?(key) }.hash
          checksum = self.class.fake_uuid(seed)
          "#{URI.parse(data['url']).hostname}-#{checksum}"
        end

        # Get page keys with key generators to emulate saving on db.
        # @private
        #
        # @return [Hash]
        def page_defaults
          @page_defaults ||= {
            'url' => nil,
            'status' => 'to_fetch',
            'job_id' => lambda { |_page| job_id },
            'method' => 'GET',
            'headers' => {},
            'fetch_type' => 'standard',
            'cookie' => nil,
            'no_redirect' => false,
            'body' => nil,
            'ua_type' => 'desktop',
            'no_url_encode' => false,
            'http2' => false,
            'vars' => {}
          }
        end

        # Stored page collection.
        #
        # @return [AeEasy::Core::SmartCollection]
        #
        # @note Page gid will be replaced on insert by an auto generated uuid
        #   unless page gid overriding is enabled
        #   (see #allow_page_gid_override?)
        def pages
          return @pages unless @pages.nil?

          collection = self.class.new_collection(PAGE_KEYS, defaults: page_defaults)
          collection.bind_event(:before_defaults) do |_collection, raw_item|
            item = AeEasy::Core.deep_stringify_keys(raw_item)
            # Dropping the key lets the `job_id` default apply instead.
            item.delete('job_id') unless allow_job_id_override?
            item
          end
          collection.bind_event(:before_insert) do |_collection, item, _match|
            if item['gid'].nil? || !allow_page_gid_override?
              item['gid'] = generate_page_gid(item)
            end
            item
          end
          collection.bind_event(:after_insert) do |_collection, item|
            ensure_job(item['job_id'])
          end
          @pages ||= collection
        end

        # Generate a fake UUID for outputs.
        #
        # @param [Hash] data Output data (currently unused).
        #
        # @return [String]
        def generate_output_id(data)
          # Generate random UUID to match AnswersEngine behavior
          self.class.fake_uuid
        end

        # Get output keys with key generators to emulate saving on db.
        # @private
        #
        # @return [Hash]
        def output_defaults
          @output_defaults ||= {
            '_collection' => DEFAULT_COLLECTION,
            '_job_id' => lambda { |_output| job_id },
            '_created_at' => lambda { |_output| self.class.time_stamp },
            '_gid' => lambda { |_output| page_gid }
          }
        end

        # Stored output collection
        #
        # @return [AeEasy::Core::SmartCollection]
        #
        # @note `_job_id` and `_gid` given by the user are dropped before
        #   defaults are applied unless the matching override is enabled
        #   (see #allow_job_id_override? and #allow_page_gid_override?).
        def outputs
          return @outputs unless @outputs.nil?

          collection = self.class.new_collection(OUTPUT_KEYS, defaults: output_defaults)
          collection.bind_event(:before_defaults) do |_collection, raw_item|
            item = AeEasy::Core.deep_stringify_keys(raw_item)
            item.delete('_job_id') unless allow_job_id_override?
            # `_gid` is the key declared by #output_defaults.
            item.delete('_gid') unless allow_page_gid_override?
            item
          end
          collection.bind_event(:before_insert) do |_collection, item, _match|
            item['_id'] ||= generate_output_id(item)
            item
          end
          collection.bind_event(:after_insert) do |_collection, item|
            ensure_job(item['_job_id'])
          end
          @outputs ||= collection
        end

        # Match data to filters.
        # @private
        #
        # @param data Hash containing data.
        # @param filters Filters to apply on match.
        #
        # @return [Boolean]
        #
        # @note Missing and `nil` values on `data` will match when `filters`'
        #   field is `nil`.
        def match?(data, filters)
          filters.all? { |key, value| data[key] == value }
        end

        # Search items from a collection.
        #
        # @param [Symbol] collection Allowed values: `:outputs`, `:pages`, `:jobs`.
        # @param [Hash] filter Filters to query.
        # @param [Integer] offset (0) Search results offset.
        # @param [Integer,nil] limit (nil) Limit search results count. Set to `nil` for unlimited.
        #
        # @return [Array] Matching items; empty when `limit` is zero or negative.
        #
        # @raise ArgumentError On unknown collection.
        #
        # @note _Warning:_ It uses table scan to filter and should be used on test suites only.
        def query(collection, filter, offset = 0, limit = nil)
          return [] unless limit.nil? || limit > 0

          # Get collection items
          items =
            case collection
            when :outputs then outputs
            when :pages then pages
            when :jobs then jobs
            else raise ArgumentError, "Unknown collection #{collection}."
            end

          # Search items
          count = 0
          matches = []
          items.each do |item|
            next unless match?(item, filter)

            count += 1
            # Skip until offset
            next unless offset < count
            # Break on limit reach
            break unless limit.nil? || matches.count < limit

            matches << item
          end
          matches
        end

        # Refetch a page: reset its fetch, parse and response fields.
        #
        # @param [Integer] job_id Page's job_id to refetch.
        # @param [String] gid Page's gid to refetch.
        #
        # @raise [RuntimeError] When no page matches `job_id` and `gid`.
        def refetch(job_id, gid)
          page = pages.find_match('gid' => gid, 'job_id' => job_id)
          # NOTE(review): a bare `Exception` inside this namespace presumably
          # resolves to the project's AeEasy::Core::Exception rather than
          # ::Exception, so a plain `raise` (RuntimeError) is used - confirm.
          raise "Page not found with job_id \"#{job_id}\" gid \"#{gid}\"" if page.nil?

          now = self.class.time_stamp
          page['status'] = 'to_fetch'
          page['freshness'] = now
          page['to_fetch'] = now
          page['fetched_from'] = nil
          page['fetching_at'] = '2001-01-01T00:00:00Z'
          page['fetched_at'] = nil
          page['fetching_try_count'] = 0
          page['effective_url'] = nil
          page['parsing_at'] = nil
          page['parsing_failed_at'] = nil
          page['parsed_at'] = nil
          page['parsing_try_count'] = 0
          page['parsing_fail_count'] = 0
          page['parsing_updated_at'] = '2001-01-01T00:00:00Z'
          page['response_checksum'] = nil
          page['response_status'] = nil
          page['response_status_code'] = nil
          page['response_headers'] = nil
          page['response_cookie'] = nil
          page['response_proto'] = nil
          page['content_type'] = nil
          page['content_size'] = 0
          page['failed_response_status_code'] = nil
          page['failed_response_headers'] = nil
          page['failed_response_cookie'] = nil
          page['failed_effective_url'] = nil
          page['failed_at'] = nil
          page['failed_content_type'] = nil
        end

        # Reparse a page: reset its parse fields.
        #
        # @param [Integer] job_id Page's job_id to reparse.
        # @param [String] gid Page's gid to reparse.
        #
        # @raise [RuntimeError] When no page matches `job_id` and `gid`.
        def reparse(job_id, gid)
          page = pages.find_match('gid' => gid, 'job_id' => job_id)
          # NOTE(review): plain `raise` instead of `Exception.new`, which
          # presumably resolves to AeEasy::Core::Exception here - confirm.
          raise "Page not found with job_id \"#{job_id}\" gid \"#{gid}\"" if page.nil?

          page['status'] = 'to_parse'
          page['parsing_at'] = nil
          page['parsing_failed_at'] = nil
          page['parsing_updated_at'] = '2001-01-01T00:00:00Z'
          page['parsed_at'] = nil
          page['parsing_try_count'] = 0
          page['parsing_fail_count'] = 0
        end
      end
    end
  end
end
|