udata-hydra 2.0.5.dev5384__tar.gz → 2.0.5.dev5485__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/PKG-INFO +1 -1
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/pyproject.toml +1 -1
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/analysis/csv.py +5 -5
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/analysis/resource.py +1 -1
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/crawl/helpers.py +5 -3
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/routes/status.py +1 -1
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/auth.py +5 -5
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/README.md +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/analysis/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/analysis/errors.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/analysis/helpers.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/app.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/cli.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/config_default.toml +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/context.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/crawl/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/crawl/check_resources.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/crawl/process_check_data.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/crawl/select_batch.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/db/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/db/check.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/db/resource.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/db/resource_exception.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/logger.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/csv/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/csv/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/csv/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/csv/20240827_add_indexes_column_to_tables_index_table.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20221205_initial_up_rev1.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20221206_rev1_up_rev2.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20221206_rev2_up_rev3.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20221208_rev3_up_rev4.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20221208_rev4_up_rev5.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20230119_rev5_up_rev6.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20230121_rev6_up_rev7.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20230121_rev7_up_rev8.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20230130_drop_migrations.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20230206_datetime_aware.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20230515_rev8_up_rev9.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20230606_rev9_up_rev10.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20231102_drop_csv_analysis.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20240827_add_resources_exceptions_table.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20240926_add_indexes.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20241004_add_comment_column_to_resources_exceptions.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20241021_add_parquet_columns.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/migrations/main/20241023_alter_foreign_key.sql +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/routes/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/routes/checks.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/routes/resources.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/routes/resources_exceptions.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/routes/resources_legacy.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/schemas/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/schemas/check.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/schemas/resource.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/schemas/resource_exception.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/__init__.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/csv.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/db.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/file.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/http.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/minio.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/parquet.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/queue.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/reader.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/utils/timer.py +0 -0
- {udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/worker.py +0 -0
|
@@ -285,13 +285,13 @@ async def csv_to_parquet(
|
|
|
285
285
|
Convert a csv file to parquet using inspection data.
|
|
286
286
|
|
|
287
287
|
Args:
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
288
|
+
file_path: CSV file path to convert.
|
|
289
|
+
inspection: CSV detective report.
|
|
290
|
+
table_name: used to name the parquet file.
|
|
291
291
|
|
|
292
292
|
Returns:
|
|
293
|
-
|
|
294
|
-
|
|
293
|
+
parquet_url: URL of the parquet file.
|
|
294
|
+
parquet_size: size of the parquet file.
|
|
295
295
|
"""
|
|
296
296
|
if not config.CSV_TO_PARQUET:
|
|
297
297
|
log.debug("CSV_TO_PARQUET turned off, skipping parquet export.")
|
|
@@ -260,7 +260,7 @@ async def detect_resource_change_on_early_hints(
|
|
|
260
260
|
|
|
261
261
|
|
|
262
262
|
async def detect_resource_change_from_harvest(
|
|
263
|
-
checks_data:
|
|
263
|
+
checks_data: list, resource_id: str
|
|
264
264
|
) -> tuple[Change, dict | None]:
|
|
265
265
|
"""
|
|
266
266
|
Checks if resource has a harvest.modified_at
|
|
@@ -22,8 +22,7 @@ async def get_content_type_from_header(headers: dict) -> str:
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def convert_headers(headers: CIMultiDictProxy) -> dict:
|
|
25
|
-
"""
|
|
26
|
-
Convert headers from aiohttp CIMultiDict type to dict type
|
|
25
|
+
"""Convert headers from aiohttp CIMultiDict type to dict type.
|
|
27
26
|
|
|
28
27
|
:warning: this will only take the first value for a given header key but multidict is not json serializable
|
|
29
28
|
"""
|
|
@@ -68,7 +67,10 @@ async def is_domain_backoff(domain: str) -> tuple[bool, str]:
|
|
|
68
67
|
"""Check if we should not crawl on this domain, in order to avoid 429 errors/bans as much as we can. We backoff if:
|
|
69
68
|
- we have hit a 429
|
|
70
69
|
- we have hit the rate limit on our side
|
|
71
|
-
|
|
70
|
+
|
|
71
|
+
Returns:
|
|
72
|
+
A boolean indicating if it should backoff or not
|
|
73
|
+
A string with the message why we should backoff
|
|
72
74
|
"""
|
|
73
75
|
backoff: tuple = (False, "")
|
|
74
76
|
|
|
@@ -95,7 +95,7 @@ async def get_stats(request: web.Request) -> web.Response:
|
|
|
95
95
|
"""
|
|
96
96
|
stats_status = await request.app["pool"].fetchrow(q)
|
|
97
97
|
|
|
98
|
-
def cmp_rate(key):
|
|
98
|
+
def cmp_rate(key: str) -> float | int:
|
|
99
99
|
if stats_catalog["count_checked"] == 0:
|
|
100
100
|
return 0
|
|
101
101
|
return round(stats_status[key] / stats_catalog["count_checked"] * 100, 1)
|
|
@@ -24,13 +24,13 @@ def token_auth_middleware(
|
|
|
24
24
|
Token auth middleware that checks the "Authorization" HTTP header for a token. If the token in the request headers is valid, the middleware adds the user to the request under the key named by the "request_property" variable; otherwise it raises an HTTPForbidden exception.
|
|
25
25
|
|
|
26
26
|
Args:
|
|
27
|
-
request_property
|
|
27
|
+
request_property: Key under which the user is saved in the request object.
|
|
28
28
|
Defaults to 'user'.
|
|
29
|
-
auth_scheme
|
|
29
|
+
auth_scheme: Prefix for value in "Authorization" header.
|
|
30
30
|
Defaults to 'Bearer'.
|
|
31
|
-
exclude_routes:
|
|
31
|
+
exclude_routes: Tuple of paths that will be excluded.
|
|
32
32
|
Defaults to empty tuple.
|
|
33
|
-
exclude_methods
|
|
33
|
+
exclude_methods: Tuple of http methods that will be
|
|
34
34
|
excluded. Defaults to empty tuple.
|
|
35
35
|
|
|
36
36
|
Raises:
|
|
@@ -38,7 +38,7 @@ def token_auth_middleware(
|
|
|
38
38
|
web.HTTPForbidden: Wrong token, schema or header.
|
|
39
39
|
|
|
40
40
|
Returns:
|
|
41
|
-
|
|
41
|
+
Aiohttp middleware.
|
|
42
42
|
"""
|
|
43
43
|
|
|
44
44
|
@web.middleware
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/crawl/check_resources.py
RENAMED
|
File without changes
|
{udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/crawl/process_check_data.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/db/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/routes/resources_exceptions.py
RENAMED
|
File without changes
|
{udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/routes/resources_legacy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{udata_hydra-2.0.5.dev5384 → udata_hydra-2.0.5.dev5485}/udata_hydra/schemas/resource_exception.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|