udata-hydra 2.0.5.dev5485__tar.gz → 2.0.5.dev5527__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/PKG-INFO +1 -1
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/pyproject.toml +1 -1
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/resource.py +6 -4
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/cli.py +2 -1
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/check_resources.py +14 -2
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/checks.py +6 -1
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/README.md +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/csv.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/errors.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/helpers.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/app.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/config_default.toml +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/context.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/helpers.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/process_check_data.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/select_batch.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/check.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/resource.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/resource_exception.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/logger.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/resources.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/resources_exceptions.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/resources_legacy.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/status.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/check.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/resource.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/resource_exception.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/auth.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/csv.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/db.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/file.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/http.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/minio.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/parquet.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/queue.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/reader.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/utils/timer.py +0 -0
- {udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/worker.py +0 -0
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/analysis/resource.py
|
@@ -31,7 +31,9 @@ class Change(Enum):
|
|
|
31
31
|
log = logging.getLogger("udata-hydra")
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
async def analyse_resource(check_id: int, is_first_check: bool) -> None:
|
|
34
|
+
async def analyse_resource(
|
|
35
|
+
check_id: int, is_first_check: bool, force_analysis: bool = False
|
|
36
|
+
) -> None:
|
|
35
37
|
"""
|
|
36
38
|
Perform analysis on the resource designated by check_id:
|
|
37
39
|
- change analysis
|
|
@@ -70,7 +72,7 @@ async def analyse_resource(check_id: int, is_first_check: bool) -> None:
|
|
|
70
72
|
# if the change status is NO_GUESS or HAS_CHANGED, let's download the file to get more infos
|
|
71
73
|
dl_analysis = {}
|
|
72
74
|
tmp_file = None
|
|
73
|
-
if change_status != Change.HAS_NOT_CHANGED:
|
|
75
|
+
if change_status != Change.HAS_NOT_CHANGED or force_analysis:
|
|
74
76
|
try:
|
|
75
77
|
tmp_file = await download_resource(url, headers, max_size_allowed)
|
|
76
78
|
except IOError:
|
|
@@ -107,7 +109,7 @@ async def analyse_resource(check_id: int, is_first_check: bool) -> None:
|
|
|
107
109
|
|
|
108
110
|
analysis_results = {**dl_analysis, **(change_payload or {})}
|
|
109
111
|
|
|
110
|
-
if change_status == Change.HAS_CHANGED or is_first_check:
|
|
112
|
+
if change_status == Change.HAS_CHANGED or is_first_check or force_analysis:
|
|
111
113
|
if is_tabular and tmp_file:
|
|
112
114
|
# Change status to TO_ANALYSE_CSV
|
|
113
115
|
await Resource.update(resource_id, data={"status": "TO_ANALYSE_CSV"})
|
|
@@ -260,7 +262,7 @@ async def detect_resource_change_on_early_hints(
|
|
|
260
262
|
|
|
261
263
|
|
|
262
264
|
async def detect_resource_change_from_harvest(
|
|
263
|
-
checks_data:
|
|
265
|
+
checks_data: tuple, resource_id: str
|
|
264
266
|
) -> tuple[Change, dict | None]:
|
|
265
267
|
"""
|
|
266
268
|
Checks if resource has a harvest.modified_at
|
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/cli.py
|
@@ -137,7 +137,7 @@ async def crawl_url(url: str, method: str = "get"):
|
|
|
137
137
|
|
|
138
138
|
|
|
139
139
|
@cli
|
|
140
|
-
async def check_resource(resource_id: str, method: str = "get"):
|
|
140
|
+
async def check_resource(resource_id: str, method: str = "get", force_analysis: bool = True):
|
|
141
141
|
"""Trigger a complete check for a given resource_id"""
|
|
142
142
|
resource: asyncpg.Record | None = await Resource.get(resource_id)
|
|
143
143
|
if not resource:
|
|
@@ -149,6 +149,7 @@ async def check_resource(resource_id: str, method: str = "get"):
|
|
|
149
149
|
resource_id=resource_id,
|
|
150
150
|
session=session,
|
|
151
151
|
method=method,
|
|
152
|
+
force_analysis=force_analysis,
|
|
152
153
|
worker_priority="high",
|
|
153
154
|
)
|
|
154
155
|
|
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/check_resources.py
RENAMED
|
@@ -61,6 +61,7 @@ async def check_resource(
|
|
|
61
61
|
sleep: float = 0,
|
|
62
62
|
method: str = "head",
|
|
63
63
|
worker_priority: str = "default",
|
|
64
|
+
force_analysis: bool = False,
|
|
64
65
|
) -> str:
|
|
65
66
|
log.debug(f"check {url}, sleep {sleep}, method {method}")
|
|
66
67
|
|
|
@@ -101,7 +102,12 @@ async def check_resource(
|
|
|
101
102
|
end = time.time()
|
|
102
103
|
if method != "get" and not has_nice_head(resp):
|
|
103
104
|
return await check_resource(
|
|
104
|
-
url,
|
|
105
|
+
url,
|
|
106
|
+
resource_id,
|
|
107
|
+
session,
|
|
108
|
+
force_analysis=force_analysis,
|
|
109
|
+
method="get",
|
|
110
|
+
worker_priority=worker_priority,
|
|
105
111
|
)
|
|
106
112
|
resp.raise_for_status()
|
|
107
113
|
|
|
@@ -122,7 +128,13 @@ async def check_resource(
|
|
|
122
128
|
await Resource.update(resource_id, data={"status": "TO_ANALYSE_RESOURCE"})
|
|
123
129
|
|
|
124
130
|
# Enqueue the resource for analysis
|
|
125
|
-
queue.enqueue(
|
|
131
|
+
queue.enqueue(
|
|
132
|
+
analyse_resource,
|
|
133
|
+
check["id"],
|
|
134
|
+
is_first_check,
|
|
135
|
+
force_analysis,
|
|
136
|
+
_priority=worker_priority,
|
|
137
|
+
)
|
|
126
138
|
|
|
127
139
|
return RESOURCE_RESPONSE_STATUSES["OK"]
|
|
128
140
|
|
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/checks.py
|
@@ -64,6 +64,7 @@ async def create_check(request: web.Request) -> web.Response:
|
|
|
64
64
|
try:
|
|
65
65
|
payload: dict = await request.json()
|
|
66
66
|
resource_id: str = payload["resource_id"]
|
|
67
|
+
force_analysis: bool = payload.get("force_analysis", True)
|
|
67
68
|
except Exception as err:
|
|
68
69
|
raise web.HTTPBadRequest(text=json.dumps({"error": str(err)}))
|
|
69
70
|
|
|
@@ -80,7 +81,11 @@ async def create_check(request: web.Request) -> web.Response:
|
|
|
80
81
|
timeout=None, headers={"user-agent": config.USER_AGENT}
|
|
81
82
|
) as session:
|
|
82
83
|
status: str = await check_resource(
|
|
83
|
-
url=url,
|
|
84
|
+
url=url,
|
|
85
|
+
resource_id=resource_id,
|
|
86
|
+
force_analysis=force_analysis,
|
|
87
|
+
session=session,
|
|
88
|
+
worker_priority="high",
|
|
84
89
|
)
|
|
85
90
|
context.monitor().refresh(status)
|
|
86
91
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/crawl/process_check_data.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/db/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/resources_exceptions.py
RENAMED
|
File without changes
|
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/routes/resources_legacy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.0.5.dev5485 → udata_hydra-2.0.5.dev5527}/udata_hydra/schemas/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|