syncmodels 0.1.327__py2.py3-none-any.whl → 0.1.328__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syncmodels/__init__.py +1 -1
- syncmodels/definitions.py +1 -0
- syncmodels/logic/tasks.py +658 -0
- {syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/METADATA +2 -2
- {syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/RECORD +10 -9
- {syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/AUTHORS.rst +0 -0
- {syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/LICENSE +0 -0
- {syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/WHEEL +0 -0
- {syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/entry_points.txt +0 -0
- {syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/top_level.txt +0 -0
syncmodels/__init__.py
CHANGED
syncmodels/definitions.py
CHANGED
syncmodels/logic/tasks.py
ADDED
@@ -0,0 +1,658 @@
+import asyncio
+
+# import re
+import os
+import sys
+import time
+import re
+import traceback
+import random
+import pickle
+import yaml
+
+from datetime import datetime, timezone
+from typing import List, Optional, Dict
+
+from pydantic import BaseModel, Field, field_validator
+
+from agptools.logs import logger
+from agptools.containers import soft, overlap, filter_dict, filter_list
+from agptools.loaders import ModuleLoader
+
+# from syncmodels.crawler import iAsyncCrawler
+from syncmodels.definitions import (
+    ORG_KEY,
+    WAVE_INFO_KEY,
+    WHERE_KEY,
+    KIND_KEY,
+    UID_TYPE,
+)
+
+# from syncmodels.mapper import Mapper, I, DATE, FLOAT
+# from syncmodels.model import BaseModel, Datetime
+# from syncmodels.session.http import HTTPSession
+# from syncmodels.crud import DEFAULT_DATABASE
+
+from syncmodels.storage import SurrealistStorage, WaveStorage
+from syncmodels.syncmodels import SyncModel
+from syncmodels.wave import TUBE_SNAPSHOT
+from syncmodels.crawler import iAsyncCrawler
+from syncmodels.model import Datetime
+
+# from kraken.helpers import *
+# from kraken.cli.main import main, CONTEXT_SETTINGS
+# from kraken.cli.config import config
+
+
+class Boostrap(BaseModel):
+    kind__: str = Field(
+        None,
+        # alias=KIND_KEY,
+        description="kind used for crawler mapper",
+        examples=[
+            "parking/status",
+            "/departures/traffictype/",
+            "vuelos-llegada",
+            "raw_waterconsumption",
+            "HOTEL_OCCUPATION",
+        ],
+    )
+    method__: str = Field(
+        "get",
+        # alias=METHOD_KEY,
+        description="Session method to be used (default: get)",
+        examples=[
+            "get",
+            "post",
+        ],
+    )
+    prefix__: str | None = Field(
+        None,
+        # alias=PREFIX_KEY,
+        description="Prefix used for storing the data, default None and use crawler built-in DEFAULT_PREFIX",
+        examples=[
+            None,
+            "transport://adif/trains:{{ id }}",
+            f"centesimal://centesimal/{{{KIND_KEY}}}" ":{{ id }}",
+            "stats://ine/dti/flows/{COD}:{Fecha}",
+        ],
+    )
+    path__: Optional[str] = Field(
+        None,
+        # alias=PATH_KEY,
+        description="path for calling EndPoint",
+        examples=[
+            "/ftp/CPTEXP.TXT",
+        ],
+    )
+    limit_value__: Optional[int] = Field(
+        None,
+        # alias=PATH_KEY,
+        description="limit of number of item on every cycle",
+        examples=[
+            1000,
+        ],
+    )
+    grace__: Optional[int] = Field(
+        24 * 3600,
+        # alias=PATH_KEY,
+        description="grace period to protect same data duplication based on wave (seconds)",
+        examples=[
+            86400,
+        ],
+    )
+
+
+class Task(BaseModel):
+    """A Kraken Task model
+    TODO: update in coockiecutter template
+    """
+
+    id: Optional[UID_TYPE] = Field(
+        # "101_item",
+        description="Item unique identifier",
+        examples=[
+            "e81265d0ac1a5a4fb32fa2ab5a5bd930067b9153",
+            "fdfe1b7f64bbdb380661c60ed4bff1ae7692a1f1",
+            "c123576d726ab9973390e562d28c0f619fada79b",
+            "65aaa94497ce4474dd526c35e26dfcbe1b787fb0",
+            "17ccd7ba28b3b920217c215a18099d610de135de",
+        ],
+    )
+    name: Optional[str] | None = Field(
+        # "kraken no name",
+        description="kraken name",
+        examples=[
+            "nice-item",
+        ],
+    )
+    description: Optional[str] | None = Field(
+        # "a nice kraken object",
+        description="kraken human more descriptive name",
+        examples=[
+            "A Nice Item",
+        ],
+    )
+
+    unit: Optional[str] = Field(
+        description="name/regexp to identify unit component",
+        examples=[
+            "smassa",
+            "aena",
+            "ine",
+            "adif",
+        ],
+    )
+    crawler: Optional[str] = Field(
+        ".",
+        description="name/regexp to identify Crawler Factory",
+        examples=[
+            "smassa",
+            "aena",
+            "ine",
+            "adif",
+        ],
+    )
+    storage_url: Optional[str] | None = Field(
+        None,
+        description="url for storage server. Using None by default will use config file vaule",
+        examples=[
+            None,
+            "ws://localhost:12080",
+        ],
+    )
+    fibers: Optional[int | None] = Field(
+        1,
+        description="num of fibers for crawler",
+        examples=[
+            1,
+            2,
+        ],
+    )
+    restart: Optional[int | None] = Field(
+        60,
+        description="seconds to wait to restart the crawling process",
+        examples=[
+            60,
+        ],
+    )
+    cycles: Optional[int | None] = Field(
+        None,
+        description="how many cycles the crawler must execute until task if finished (-1 means infinite)",
+        examples=[-1, 1, 5, 10],
+    )
+    cache_timeout: Optional[int | None] = Field(
+        60,
+        description="period in which same url+params requests will be ignored if it have been executed before",
+        examples=[60, 600],
+    )
+    bootstrap: List[Boostrap] | None = Field(
+        None,
+        description="a list of Bootstrap items to be used in crawler. Ommiting this parameter will use cralwer default_bootstrap() method",
+    )
+    # time
+    updated: Optional[Datetime] = Field(
+        None,
+        description="Task Update",
+        # pattern=r"\d+\-\d+\-\d+T\d+:\d+:\d+", # is already a datetime
+    )
+    paused: Optional[bool] = Field(
+        False,
+        description="determine if a task must be paused or not",
+        examples=[False, True],
+    )
+
+    @field_validator("id")
+    def convert_id(cls, value):
+        if not isinstance(value, UID_TYPE):
+            value = UID_TYPE(value)
+        # TODO: make some validations here
+        return value
+
+
+from ..definitions import (
+    TASK_THING,
+)
+
+
+# ---------------------------------------------------------
+# Loggers
+# ---------------------------------------------------------
+
+
+log = logger(__name__)
+
+# ---------------------------------------------------------
+# launch
+# ---------------------------------------------------------
+
+
+def launch(
+    task: Dict,
+    config_path: str | None = None,
+    pattern: List = None,
+):
+    # for config_path in env.config_files:
+    #     if os.access(config_path, os.F_OK):
+    #         break
+    # else:
+    #     config_path = None
+    #
+    # setup the syncmodel that holds all data
+    # url = env.surreal_url
+
+    if config_path:
+        cfg = yaml.load(open(config_path, "t", encoding="utf-8"), Loader=yaml.Loader)
+    else:
+        cfg = {}
+
+    if isinstance(task, Dict):
+        dtask = task
+    else:
+        dtask = task.model_dump()
+
+    soft(
+        dtask,
+        {
+            "task_filters": pattern,
+        },
+    )
+    url = dtask.get("storage_url") or cfg.get("storage_url")
+    surreal = SurrealistStorage(url)
+    storage = WaveStorage(storage=surreal)
+    syncmodel = SyncModel(
+        config_path=config_path,
+        storage=[
+            storage,
+            # yaml_wave,
+        ],
+    )
+
+    loader = ModuleLoader("unit")  # could be a module or a folder name
+    # available = loader.available_modules()
+
+    # loader = ModuleLoader(CRAWLER_UNITS_LOCATION)
+    # available = loader.available_modules()
+
+    crawler = dtask.get("crawler") or [".*"]
+    wanted = loader.available_modules([dtask.get("unit") or ".*"])
+    if len(wanted) > 1:
+        log.warning(
+            "Too many (%s) units found using: `%s` pattern",
+            len(wanted),
+            dtask.get("unit"),
+        )
+        for name in wanted:
+            log.info(" - %s", name)
+        log.info(
+            "try to use a closer filter to get the template unit, i.e. `%s`",
+            name,
+        )
+    else:
+        for i, mod in enumerate(loader.load_modules(wanted)):
+            # make introspection of the module
+            if not (inventory := getattr(mod, "inventory")):
+                log.error("Unit: %s has not `inventory` !!")
+                log.error("Review unit and provide the required initialization entry")
+                continue
+
+            try:
+
+                log.debug("module: [%s] inventory", mod.inventory)
+                _crawlers = mod.inventory[iAsyncCrawler.__name__]
+                log.info("found: [%s] crawler factories", len(_crawlers))
+                if _crawlers:
+                    for _, (name, factory) in enumerate(_crawlers.items()):
+                        log.info("- name: [%s] : factory: [%s]", name, factory)
+                        if loader.match_any_regexp(factory.__name__, [crawler], re.I):
+                            mod.default_setup()
+                            # instance = factory(
+                            #     config_path=config_path,
+                            #     syncmodel=syncmodel,
+                            #     restart=task.restart,
+                            #     cycles=task.cycles,
+                            #     cache_timeout=task.cache_timeout,
+                            # )
+                            instance = factory(
+                                config_path=config_path,
+                                syncmodel=syncmodel,
+                                **dtask,
+                                # restart=task.restart,
+                                # cycles=task.cycles,
+                                # cache_timeout=task.cache_timeout,
+                            )
+                            # add bootstrap from task of let use the
+                            # default_bootstrap() (instance.bootstrap=None)
+                            if dtask.get("bootstrap"):
+                                # check that bootstrap looks complete based
+                                # on default_bootstrap
+                                default_bootstrap = list(instance.default_bootstrap())
+
+                                # default_bootstrap can have multiple options with a
+                                # combination of kind__, path__, etc so I try to get
+                                # the best match and complement the missing information
+                                def best_boot(boot):
+                                    keys = set(boot)
+                                    best = {}
+                                    for candidate in default_bootstrap:
+                                        if candidate.get(KIND_KEY) != boot.get(
+                                            KIND_KEY
+                                        ):
+                                            # can merge only with same keys
+                                            continue
+                                        if len(keys.intersection(candidate)) > len(
+                                            best
+                                        ):
+                                            best = candidate
+                                    return best
+
+                                bootstrap = []
+                                for boot in dtask["bootstrap"]:
+                                    best = best_boot(boot)
+                                    # complement missing data from bootstrap
+                                    overlap(boot, best)
+                                    bootstrap.append(boot)
+                                instance.bootstrap = bootstrap
+
+                            # asyncio.run(instance.run())
+                            yield instance
+
+                else:
+                    log.error("No crawlers match [`%s`] name", crawler)
+                    log.error("Candidates:")
+                    for idx, (name, factory) in enumerate(_crawlers.items()):
+                        log.error("[%s] %s : %s", idx, name, factory)
+
+                    log.error(
+                        "You must set a name/regexp that match any of these names"
+                    )
+
+            except Exception as why:
+                log.error("%s", why)
+                log.error("Ignoring this module!!")
+
+    foo = 1
+
+
+# ---------------------------------------------------------
+# run_tasks
+# ---------------------------------------------------------
+
+
+async def update_task(task):
+
+    APP_NS = os.getenv("APP_NS", "myapp_ns")
+    APP_DB = os.getenv("APP_DB", "myapp_db")
+
+    cfg = dict(env.__dict__)
+    try:
+        url = cfg["surreal_url"]
+        config_path = cfg.get("config_path", "config.yaml")
+        surreal = SurrealistStorage(url)
+        storage = WaveStorage(storage=surreal)
+        syncmodel = SyncModel(
+            config_path=config_path,
+            storage=[
+                storage,
+            ],
+        )
+
+        org_id = task.id
+        fqid = f"{APP_NS}://{APP_DB}/{TASK_THING}:{org_id}"
+        task.id = fqid
+        task.updated = datetime.now(tz=timezone.utc)
+
+        dtask = task.model_dump()
+
+        wave_info = {
+            WAVE_INFO_KEY: ["bootstrap"],
+            **dtask,
+        }
+        result = await syncmodel.put(task, **wave_info)
+        if result:
+            query = f"{APP_NS}://{APP_DB}/{TUBE_SNAPSHOT}"
+            params = {
+                ORG_KEY: fqid,
+            }
+            result = await syncmodel.query(query, **params)
+            for fqid, data in result.items():
+                stored_task = Task(**data)
+                # returns the task saved in storage as confirmation
+                return stored_task
+
+            msg = f"Saved task: {org_id} can't be retrieved back for checking"
+        else:
+            msg = f"Task {org_id} hasn't been saved in storage"
+        # return failed response
+        raise RuntimeError(msg)
+
+    except Exception as e:
+        log.error(e)
+        msg = "".join(traceback.format_exception(*sys.exc_info()))
+        log.error(msg)
+        raise RuntimeError(msg) from e
+
+
+def run_tasks(
+    cycles=-1,
+    restart=60,
+    active_units=None,
+    active_crawlers=None,
+    ignore_units=None,
+    ignore_crawlers=None,
+    **context,
+):
+    APP_NS = context.get("APP_NS", "kraken")
+    APP_DB = context.get("APP_DB", "kraken")
+
+    url = context.get("surreal_url")
+    config_path = context.get("config_path")
+    surreal = SurrealistStorage(url)
+    storage = WaveStorage(storage=surreal)
+    ctx = dict(storage=storage, dead=restart)
+
+    async def iteration_main():
+
+        syncmodel = SyncModel(
+            config_path=config_path,
+            storage=[
+                storage,
+            ],
+        )
+
+        query = f"{APP_NS}://{APP_DB}/{TASK_THING}"
+
+        # params = request.params
+        # if params:
+        #     params = params.model_dump()
+        # else:
+        #     params = {
+        #         MONOTONIC_KEY: 0,
+        #     }
+
+        context.setdefault("storage_url", context.get("surreal_url"))
+
+        params = {
+            # key: value
+            # for key, value in params.items()
+            # if value not in (None, "")
+            WHERE_KEY: "crawler",  # object has crawler keyword defined
+        }
+
+        task_definition = await syncmodel.query(query, **params)
+        last_task_definition = pickle.loads(pickle.dumps(task_definition))
+
+        timeout = 10
+        tasks = set()
+        instances = {}
+        launched_bootstraps = {}
+        loop = asyncio.get_running_loop()
+        t0 = loop.time()
+        delay = 0
+        restart = False
+
+        for _, (fquid, data) in enumerate(task_definition.items()):
+            data = overlap(data, context)
+            task = Task(**data)
+            _task = task.model_dump()
+            # add any extra special parameter that is not in the Task Model definition
+            extra = {k: v for k, v in data.items() if re.match(r".*__$", k)}
+            _task.update(extra)
+
+            if _paused := _task.get("paused"):
+                log.info("Ignoring Task")
+                continue
+
+            exclude = list(
+                filter_list(
+                    universe=[_task.get("unit", "")], patterns=ignore_units or [r"(?!)"]
+                )
+            )
+            if exclude:
+                log.info("Ignoring task: %s due ignore_units=%s", _task, ignore_units)
+                continue
+
+            exclude = list(
+                filter_list(
+                    universe=[_task.get("crawler", "")],
+                    patterns=ignore_crawlers or [r"(?!)"],
+                )
+            )
+            if exclude:
+                log.info(
+                    "Ignoring task: %s due ignore_crawlers=%s", _task, ignore_crawlers
+                )
+                continue
+
+            for _ in filter_dict(universe=[_task], patterns=active_units or (".",)):
+                # check that there is not any redundant task
+                samples = launched_bootstraps.setdefault(task.crawler, [])
+                arguments = _task["bootstrap"]
+                if arguments in samples:
+                    log.error(
+                        "[%s] A similar task has already launched",
+                        task.crawler,
+                    )
+                    log.error("%s", arguments)
+                    continue
+                else:
+                    samples.append(arguments)
+
+                # ('[{"kind__": "parking/status", "method__": "get",
+                # "prefix__": null, "path__": null}]'))
+
+                # replace with the needed data only
+                task_definition[fquid] = _task
+                runners = list(launch(_task, pattern=active_crawlers or (".",)))
+                # TODO: update bootstrap definition to the storage
+                # TODO: so user can specify jus the crawler without
+                # TODO: any data, but the default bootstrap parameters
+                # TODO: will be saved back, so user can see current
+                # TODO: bootstrap implementation
+                delay += 1
+                instances[fquid] = [runners, t0 + delay * timeout]
+
+        while tasks or instances:
+            # start all waiting instances
+            now = loop.time()
+            for fquid, info in list(instances.items()):
+                if info[-1] < now:
+                    crawlers = info[0]
+                    if not crawlers:
+                        log.warning("Unit found, but no crawler match criteria")
+                    for crawler in info[0]:
+                        coro = crawler.run()
+                        task = loop.create_task(coro, name=fquid)
+                        tasks.add(task)
+                        log.info("Add a coro for task: %s", info)
+                        info[-1] = float("inf")
+                    instances.pop(fquid)
+
+            # wait task to finish for a short period
+            if not (tasks or instances):
+                break
+
+            foo = 2
+
+            if not tasks:
+                assert (
+                    instances
+                ), "we must be waiting for an instance that must be launched soon"
+                log.info(
+                    "no task to wait, but there's [%s] tasks that will be launched soon",
+                    len(instances),
+                )
+                # timeout = 1  # TODO: hack for faster debugging
+                await asyncio.sleep(timeout)
+                continue
+            done, tasks = await asyncio.wait(tasks, timeout=timeout)
+
+            # check if any task has been modified
+            new_task_definition = await syncmodel.query(query, **params)
+
+            # TODO: agp: restart only the affected tasks
+            if last_task_definition != new_task_definition:
+                log.info("Task definitions has change, restarting crawlers")
+                for task in tasks:
+                    frame = task.get_coro().cr_frame.f_locals
+                    crawler = frame.get("self")
+                    if isinstance(crawler, iAsyncCrawler):
+                        # request crawler to stop inmediately
+                        # crawler.running = False
+                        # request crawler to stop on the next cycle
+                        crawler.cycles = 0
+                    else:
+                        log.error("self [%s] isn't instance of iAsyncCrawler")
+                restart = True
+                tasks.clear()
+
+            if done:
+                # some stats
+                log.info("[%s] tasks are still running", len(tasks))
+                # evaluate what to do with done tasks
+                for task in done:
+                    fquid = task.get_name()
+                    data = task_definition[fquid]
+                    log.info(
+                        "task: [%s] is finished: result: [%s]",
+                        fquid,
+                        task.result(),
+                    )
+                    for key, value in data.items():
+                        log.info(
+                            " - %s: %s",
+                            key,
+                            value,
+                        )
+
+                    data["cycles"] -= 1
+                    if (
+                        not restart and data["cycles"] != 0
+                    ):  # TODO: set restarting POLICY
+                        when = max(data["restart"], 60)
+                        # a little shifted randomness that will spread the crawler load
+                        when *= 0.975 + random.random() / 10
+                        log.info(
+                            "Daemon restart [%s] in [%s] secs: [%s]",
+                            data["unit"],
+                            when,
+                            fquid,
+                        )
+                        instances[fquid][-1] = loop.time() + when
+                    else:
+                        # crawler is not restarted
+                        instances.pop(fquid, None)
+
+        foo = 1
+
+    restart = max(restart, 15)
+    while cycles != 0:
+
+        asyncio.run(iteration_main())
+        cycles -= 1
+        if cycles:
+            log.info(
+                "waiting: [%s] secs before the next crawling iteration",
+                restart,
+            )
+            time.sleep(restart)
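For orientation, here is a minimal, hypothetical usage sketch of the new module; it is not part of the released wheel. It assumes the module is importable as syncmodels.logic.tasks, that a unit module matching "smassa" exposes a crawler factory discoverable by ModuleLoader("unit"), and that a storage endpoint is reachable at ws://localhost:12080; the "parking/status" kind and the unit name are taken from the examples embedded in the Boostrap and Task models above.

# Hypothetical driver for the new syncmodels.logic.tasks module (sketch only).
# The "smassa" unit/crawler names, the "parking/status" kind and the storage URL
# are illustrative assumptions borrowed from the field examples in the diff above.
import asyncio

from syncmodels.logic.tasks import Boostrap, Task, launch, run_tasks

task = Task(
    id="parking-status-demo",  # coerced to UID_TYPE by the convert_id validator
    name="parking-status",
    description="Poll parking occupancy",
    unit="smassa",             # regexp matched against available unit modules
    crawler="smassa",          # regexp matched against crawler factories
    storage_url="ws://localhost:12080",
    cycles=1,
    bootstrap=[
        Boostrap(kind__="parking/status", method__="get"),
    ],
)

# launch() yields one configured crawler instance per matching factory;
# each instance is then run on its own event loop, as the module itself hints.
for crawler in launch(task.model_dump(), pattern=[".*"]):
    asyncio.run(crawler.run())

# Alternatively, run_tasks() pulls Task definitions from storage and schedules
# them in cycles; storage coordinates are passed through **context.
run_tasks(cycles=1, surreal_url="ws://localhost:12080", config_path="config.yaml")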
{syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: syncmodels
-Version: 0.1.327
+Version: 0.1.328
 Summary: Synchronizable Models
 Home-page: https://github.com/asterio.gonzalez/syncmodels
 Author: Asterio Gonzalez
@@ -18,7 +18,7 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.6
 License-File: LICENSE
 License-File: AUTHORS.rst
-Requires-Dist: agptools>=0.1.
+Requires-Dist: agptools>=0.1.328
 Requires-Dist: aiocache
 Requires-Dist: aiohttp
 Requires-Dist: Click
{syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
-syncmodels/__init__.py,sha256=
+syncmodels/__init__.py,sha256=AEXFwoTghbUYTPr0t_AaWqkpx8Md25aUAnPghgLHFu0,142
 syncmodels/context.py,sha256=k1Gs_ip9BfyRFpyRnzqYvRDKo0sYBqJsh6z9sWln9oE,451
 syncmodels/crawler.py,sha256=_pgelyrIKuVl8vdINJ6NSh5qkSnZf4rAACph4SZ_2H4,95281
 syncmodels/crud.py,sha256=oZIcwEKR2i-lesEF_059Y4yThohd9m7gs6R6xYgLH-I,15351
-syncmodels/definitions.py,sha256=
+syncmodels/definitions.py,sha256=w-3TrSomp9T8OzLmJhKeZQDzrUIJLKldyh1lzlE7Yj0,5476
 syncmodels/exceptions.py,sha256=ZLAwu19cs2UN2Sv3jaLnixT_jRI7T42TfyutCkUsuIk,685
 syncmodels/geofactory.py,sha256=1FkrdEn0QA0O4_lSUAwjqXH2dmlQWi32AkntnG4AEQY,10372
 syncmodels/http.py,sha256=FFVT3QJJgur2dv1Q_7l9ZsWN8z6_gUjOT9hJff1ZAqk,3335
@@ -40,6 +40,7 @@ syncmodels/logic/activity_logger.py,sha256=8wjvgRwaNbibYWGgl-trovSS70yNkoCTlb-AI
 syncmodels/logic/analyzer.py,sha256=AJncaSGlgKKG2X-Y9-6_phBZkIz4CGWYLI2JSKqLhhI,12542
 syncmodels/logic/browser.py,sha256=e0LdpHMmBaJ7cMdCFdIBLEfpewvO_mc5wqiCyNYPNoc,85197
 syncmodels/logic/swarm.py,sha256=eRBVlNAOzzWKFGCb7LGLx2aj7yQlTY1OwLoeSEllvXY,17207
+syncmodels/logic/tasks.py,sha256=7C6DYqbMOV9NTewg0qQRoCPo-k9424MlGR62ZkyPrCk,21962
 syncmodels/mapper/__init__.py,sha256=jS82LFr9zzyqXBz82tSw04vDowhTpKxhg_W2XvhUlt0,129
 syncmodels/mapper/fiware.py,sha256=auszPmhCS46z_68MXjksrQAFUfctjbVrVdBvOpOkMj8,523
 syncmodels/mapper/mapper.py,sha256=SphMhr59bbTWWxnvitonURk3lSPDerGqUTs5-P-Tjlg,17397
@@ -302,10 +303,10 @@ syncmodels/session/postgresql.py,sha256=ZMIu1Rv93pKfvFlovFBmWArzlrT2xaQWNYGZT_LW
 syncmodels/session/sql.py,sha256=7FbiKpnalFWkG9AEMDPloJ_IByyViwOR4EZRi1f7ulw,6570
 syncmodels/session/sqlite.py,sha256=nCDjopLiBpX1F10qkKoARM7JrVdIpJ1WdGOduFVxaiA,2080
 syncmodels/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-syncmodels-0.1.327.dist-info/AUTHORS.rst,sha256=
-syncmodels-0.1.327.dist-info/LICENSE,sha256=
-syncmodels-0.1.327.dist-info/METADATA,sha256=
-syncmodels-0.1.327.dist-info/WHEEL,sha256=
-syncmodels-0.1.327.dist-info/entry_points.txt,sha256=
-syncmodels-0.1.327.dist-info/top_level.txt,sha256=
-syncmodels-0.1.327.dist-info/RECORD,,
+syncmodels-0.1.328.dist-info/AUTHORS.rst,sha256=3ZPoqg8Aav8DSYKd0fwcwn4_5HwSiMLart0E5Un00-U,168
+syncmodels-0.1.328.dist-info/LICENSE,sha256=uzMOYtIiUsnsD0xHJR7aJWJ4v_bvan0kTnvufy5eNoA,1075
+syncmodels-0.1.328.dist-info/METADATA,sha256=pHc0EmSKcFt-iI___ND1Qmtn4jZ837szo_o-1dUelC4,2700
+syncmodels-0.1.328.dist-info/WHEEL,sha256=SrDKpSbFN1G94qcmBqS9nyHcDMp9cUS9OC06hC0G3G0,109
+syncmodels-0.1.328.dist-info/entry_points.txt,sha256=dMnigjZsHMxTwXiiZyBZdBbMYE0-hY3L5cG15EcDAzw,51
+syncmodels-0.1.328.dist-info/top_level.txt,sha256=2DfQ9NuAhKMjY3BvQGVBA7GfqTm7EoHNbaehSUiqiHQ,11
+syncmodels-0.1.328.dist-info/RECORD,,
{syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/AUTHORS.rst
File without changes
{syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/LICENSE
File without changes
{syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/WHEEL
File without changes
{syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/entry_points.txt
File without changes
{syncmodels-0.1.327.dist-info → syncmodels-0.1.328.dist-info}/top_level.txt
File without changes