recurvedata-lib 0.1.487__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of recurvedata-lib might be problematic. Click here for more details.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import contextlib
|
|
3
|
+
import csv
|
|
4
|
+
import datetime
|
|
5
|
+
import fcntl
|
|
6
|
+
import glob
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import shutil
|
|
12
|
+
import tempfile
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import IO, Any, Sequence, Union
|
|
15
|
+
|
|
16
|
+
from recurvedata.utils import helpers, shell
|
|
17
|
+
|
|
18
|
+
# Module-level logger for these file utilities.
logger = logging.getLogger(__name__)

# Anything accepted as a filesystem path by open()/os functions.
PathLike = Union[str, os.PathLike]

# Default CSV dialect used by the conversion helpers below:
# comma-delimited, every field quoted, CRLF line endings.
_csv_dialect_options = {
    "delimiter": ",",
    "quoting": csv.QUOTE_ALL,
    "lineterminator": "\r\n",
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def new_tempfile(suffix: str = "", prefix: str = None, dir: str = None) -> str:
    """Create a tempfile with a random, timestamp-tagged filename.

    Args:
        suffix: suffix of the filename (a timestamp is prepended to it)
        prefix: prefix of the filename
        dir: directory to store the file

    Returns:
        the filename of the created (empty) file
    """
    ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    kwargs = {"suffix": f"{ts}_{suffix}", "dir": dir}
    if prefix:
        kwargs["prefix"] = prefix
    # mkstemp returns an open OS-level handle; close it immediately so the
    # descriptor is not leaked (the original discarded the fd unclosed).
    fd, filename = tempfile.mkstemp(**kwargs)
    os.close(fd)
    return filename
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def new_tempdir(suffix: str = "", prefix: str = None, dir: str = None) -> str:
    """Create a temporary directory with a timestamp-tagged random name.

    Args:
        suffix: suffix of the directory name (a timestamp is prepended)
        prefix: prefix of the directory name
        dir: parent directory in which to create the new directory

    Returns:
        the path of the created directory
    """
    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    options = {"suffix": f"{stamp}_{suffix}", "dir": dir}
    if prefix:
        options["prefix"] = prefix
    return tempfile.mkdtemp(**options)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def merge_files(
    files: Sequence[PathLike],
    filename: str = None,
    num_skip_lines: int = 0,
    delete: bool = True,
) -> str:
    """Concat multiple files into one.

    Args:
        files: source file names
        filename: target filename, will create a tempfile if not provided
        num_skip_lines: skip n lines of every source file before merging
        delete: delete source files after being merged

    Returns:
        the target filename
    """
    import shlex

    if filename is None:
        # close the mkstemp handle so the descriptor is not leaked
        fd, filename = tempfile.mkstemp()
        os.close(fd)

    if num_skip_lines:
        with open(filename, "wb") as fout:
            for f in files:
                with open(f, "rb") as fin:
                    for _ in range(num_skip_lines):
                        fin.readline()
                    shutil.copyfileobj(fin, fout)
    else:
        if len(files) == 1 and delete:
            os.rename(files[0], filename)
        else:
            # merge by `cat` for better performance; quote every path so
            # names with spaces or shell metacharacters cannot break or
            # inject into the command line
            sources = " ".join(shlex.quote(str(f)) for f in files)
            shell.run(f"cat {sources} > {shlex.quote(str(filename))}", logger)

    if delete:
        remove_files_safely(files)

    return filename
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def remove_lines_from_start(filename: PathLike, lines: int, inplace: bool = False) -> str:
    """Drop the first *lines* lines of a file.

    Args:
        filename: source file name
        lines: number of leading lines to discard
        inplace: overwrite the source file when True

    Returns:
        the resulting filename; equals *filename* when inplace is True
    """
    scratch = new_tempfile()
    with open(filename, "rb") as src, open(scratch, "wb") as dst:
        # consume the unwanted leading lines, then bulk-copy the remainder
        for _ in range(lines):
            next(src, None)
        shutil.copyfileobj(src, dst)

    return replace_file_with_temp(scratch, filename, inplace)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def is_file_empty(filename: PathLike) -> bool:
    """Return True when the file is empty; a missing file counts as empty."""
    try:
        size = os.stat(filename).st_size
    except FileNotFoundError:
        return True
    return size == 0
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def remove_files(files: Sequence[PathLike]) -> None:
    """Delete every file in *files*; a single path is also accepted."""
    for path in helpers.ensure_list(files):
        os.unlink(path)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def remove_files_safely(files: Sequence[PathLike]) -> None:
    """Remove files, swallowing OSError/TypeError/ValueError."""
    try:
        remove_files(files)
    except (OSError, TypeError, ValueError):
        pass
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def remove_files_by_pattern(pattern: str) -> None:
    """Remove all files matching a glob *pattern*, ignoring errors."""
    matched = glob.glob(pattern)
    logger.info("files to be deleted: %s", str(matched))
    remove_files_safely(matched)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def remove_folder_safely(folder: str) -> None:
    """Recursively delete *folder* if it exists; deletion errors are ignored."""
    if os.path.exists(folder):
        shutil.rmtree(folder, ignore_errors=True)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
@contextlib.contextmanager
def ensure_remove(filename: PathLike):
    """Yield *filename* for use, then best-effort delete it on exit.

    Deletion errors are swallowed by remove_files_safely, so the cleanup
    never masks an exception raised inside the `with` body.
    """
    try:
        yield filename
    finally:
        remove_files_safely(filename)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def convert_excel_to_csv(
    src_file: PathLike,
    dst_file: PathLike = None,
    skiprows: int = 0,
    inplace: bool = True,
) -> str:
    """Convert an Excel file to a CSV file

    Args:
        src_file: the path of the Excel file
        dst_file: the path of output file, a temporary filename will be made otherwise
        skiprows: skip the first N rows
        inplace: replace the original file if True

    Returns:
        the target_filename
    """
    import pandas as pd

    if not dst_file:
        dst_file = new_tempfile(dir=os.path.dirname(src_file))

    df = pd.read_excel(src_file, skiprows=skiprows)
    # `line_terminator` was removed in pandas 2.0; the keyword has been
    # spelled `lineterminator` since pandas 1.5.
    # NOTE(review): header=False drops the column-name row — presumably
    # intended for downstream loaders; confirm with callers.
    df.to_csv(dst_file, lineterminator="\r\n", header=False, index=False)

    return replace_file_with_temp(dst_file, src_file, inplace)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def convert_jsonlines_to_csv(
    src_file: PathLike,
    dst_file: PathLike = None,
    skiprows: int = 0,
    src_encoding: str = "utf8",
    inplace: bool = True,
) -> str:
    """Convert a JSON Lines file to a CSV file

    The CSV columns are taken from the keys of the first JSON object, in
    their original order; no header row is written.

    Args:
        src_file: the path of the JSON Lines file
        dst_file: the path of output file, a temporary filename will be made otherwise
        skiprows: skip the first N rows
        src_encoding: the encoding of the JSON Lines file
        inplace: replace the original file if True

    Returns:
        the target_filename
    """
    if not dst_file:
        dst_file = new_tempfile(dir=os.path.dirname(src_file))

    decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
    with open(src_file, "r", encoding=src_encoding) as f_in, open(dst_file, "w") as f_out:
        _skip_header_rows(f_in, skiprows)

        first_line = f_in.readline()
        # An empty source (or one fully consumed by skiprows) used to crash
        # in decoder.decode(""); produce an empty CSV instead.
        if first_line.strip():
            row = decoder.decode(first_line)
            writer = csv.DictWriter(f_out, fieldnames=list(row.keys()), **_csv_dialect_options)
            writer.writerow(row)

            for line in f_in:
                writer.writerow(decoder.decode(line))

    return replace_file_with_temp(dst_file, src_file, inplace)
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def convert_encoding(
    filename: PathLike,
    src_encoding: str,
    dst_encoding: str = "utf8",
    skiprows: int = 0,
    inplace: bool = True,
) -> str:
    """Re-encode a text file from *src_encoding* to *dst_encoding*.

    Args:
        filename: the path of the file
        src_encoding: the encoding of the file
        dst_encoding: the encoding to convert to
        skiprows: skip the first N rows
        inplace: replace the original file if True

    Returns:
        the target_filename; the input path is returned untouched when the
        two encodings already match
    """
    if src_encoding == dst_encoding:
        return filename

    converted = new_tempfile(dir=os.path.dirname(filename))
    with open(filename, "r", encoding=src_encoding) as src:
        with open(converted, "w", encoding=dst_encoding) as dst:
            _skip_header_rows(src, skiprows)
            shutil.copyfileobj(src, dst)

    return replace_file_with_temp(converted, filename, inplace)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def convert_csv_dialect(
    filename: PathLike,
    src_dialect_options: dict[str, Any],
    dst_dialect_options: dict[str, Any] = None,
    skiprows: int = 0,
    src_encoding: str = "utf8",
    inplace: bool = True,
):
    """Convert the dialect of a CSV file

    Args:
        filename: the path of the CSV file
        src_dialect_options: the dialect of the file
        dst_dialect_options: the dialect to convert to; defaults to the
            module-wide _csv_dialect_options
        skiprows: skip the first N rows
        src_encoding: the encoding of the file
        inplace: replace the original file if True

    Returns:
        the target_filename

    NOTE(review): when src and dst dialects are equal, a needed encoding
    conversion is always done in place and the original *filename* is
    returned, regardless of the *inplace* argument — confirm callers
    expect this asymmetry.
    """
    if dst_dialect_options is None:
        dst_dialect_options = _csv_dialect_options.copy()

    # Identical dialects: at most an encoding conversion is required.
    if _same_dict(src_dialect_options, dst_dialect_options):
        if src_encoding != "utf8":
            convert_encoding(filename, src_encoding=src_encoding, skiprows=skiprows, inplace=True)
        return filename

    dst_file = new_tempfile(dir=os.path.dirname(filename))
    with open(filename, "r", encoding=src_encoding) as f_in, open(dst_file, "w") as f_out:
        _skip_header_rows(f_in, skiprows)

        # Re-parse with the source dialect and re-emit with the target one;
        # output is always written as the platform-default (utf-8) text.
        reader = csv.reader(f_in, **src_dialect_options)
        writer = csv.writer(f_out, **dst_dialect_options)
        for row in reader:
            writer.writerow(row)

    return replace_file_with_temp(dst_file, filename, inplace)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def replace_file_with_temp(tmp_file: PathLike, target_file: PathLike, inplace: bool = False) -> PathLike:
    """Pick the final filename of a converted file.

    When *inplace* is True the temp file is renamed over *target_file* and
    the target path is returned; otherwise the temp path is returned as-is.
    """
    if not inplace:
        return tmp_file
    os.rename(tmp_file, target_file)
    return target_file
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _skip_header_rows(f: IO, n: int = 0):
|
|
318
|
+
for _ in range(n):
|
|
319
|
+
f.readline()
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _same_dict(a: dict, b: dict) -> bool:
|
|
323
|
+
if len(a) != len(b):
|
|
324
|
+
return False
|
|
325
|
+
for k in a:
|
|
326
|
+
if k not in b or a[k] != b[k]:
|
|
327
|
+
return False
|
|
328
|
+
return True
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def calculate_md5(filepath: Path | str) -> str:
|
|
332
|
+
md5_hash = hashlib.md5()
|
|
333
|
+
chunk_size = 1024 * 1024
|
|
334
|
+
with open(filepath, "rb") as f:
|
|
335
|
+
for chunk in iter(lambda: f.read(chunk_size), b""):
|
|
336
|
+
md5_hash.update(chunk)
|
|
337
|
+
|
|
338
|
+
return md5_hash.hexdigest()
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class FileLock:
|
|
342
|
+
"""A file lock using fcntl.
|
|
343
|
+
copy from recurve web
|
|
344
|
+
"""
|
|
345
|
+
|
|
346
|
+
def __init__(self, lock_file_path: str | Path):
|
|
347
|
+
self.lock_file_path = Path(lock_file_path)
|
|
348
|
+
self.fd = None
|
|
349
|
+
|
|
350
|
+
def acquire(self):
|
|
351
|
+
try:
|
|
352
|
+
self.fd = self.lock_file_path.open("w")
|
|
353
|
+
# Acquire an exclusive lock, this will block until the lock is acquired
|
|
354
|
+
fcntl.flock(self.fd, fcntl.LOCK_EX)
|
|
355
|
+
except Exception as e:
|
|
356
|
+
self._reset()
|
|
357
|
+
raise e # Propagate unexpected exceptions
|
|
358
|
+
|
|
359
|
+
def release(self):
|
|
360
|
+
if not self.fd:
|
|
361
|
+
return
|
|
362
|
+
try:
|
|
363
|
+
fcntl.flock(self.fd, fcntl.LOCK_UN)
|
|
364
|
+
except Exception as e:
|
|
365
|
+
raise e # Propagate unexpected exceptions
|
|
366
|
+
finally:
|
|
367
|
+
self._reset()
|
|
368
|
+
|
|
369
|
+
def _reset(self):
|
|
370
|
+
if self.fd:
|
|
371
|
+
self.fd.close()
|
|
372
|
+
self.fd = None
|
|
373
|
+
|
|
374
|
+
def __enter__(self):
|
|
375
|
+
self.acquire()
|
|
376
|
+
return self
|
|
377
|
+
|
|
378
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
379
|
+
self.release()
|
|
380
|
+
|
|
381
|
+
def __del__(self):
|
|
382
|
+
try:
|
|
383
|
+
self.release()
|
|
384
|
+
except Exception:
|
|
385
|
+
# Suppress exceptions in __del__, as we've done our best
|
|
386
|
+
pass
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import os
|
|
3
|
+
from typing import Callable, Generator, Iterable, TypeVar, Union, overload
|
|
4
|
+
|
|
5
|
+
import cytoolz as toolz
|
|
6
|
+
|
|
7
|
+
from recurvedata.consts import ENV_ID_KEY
|
|
8
|
+
|
|
9
|
+
# Generic type variables used by the container helpers below.
T = TypeVar("T")  # arbitrary element type
_VT = TypeVar("_VT")  # dict value type
_KT = TypeVar("_KT")  # dict key type
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Hash helpers
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_hash(v: Union[str, bytes], hash_func: Callable) -> str:
|
|
18
|
+
if isinstance(v, str):
|
|
19
|
+
v = v.encode()
|
|
20
|
+
if not isinstance(v, bytes):
|
|
21
|
+
v = str(v).encode()
|
|
22
|
+
return hash_func(v).hexdigest()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def sha256hash(v: Union[str, bytes]) -> str:
    """Hex SHA-256 digest of *v* (str is UTF-8 encoded; others via str())."""
    if isinstance(v, str):
        v = v.encode()
    elif not isinstance(v, bytes):
        v = str(v).encode()
    return hashlib.sha256(v).hexdigest()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def md5hash(v: Union[str, bytes]) -> str:
    """Hex MD5 digest of *v* (str is UTF-8 encoded; others via str())."""
    if isinstance(v, str):
        v = v.encode()
    elif not isinstance(v, bytes):
        v = str(v).encode()
    return hashlib.md5(v).hexdigest()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# String helpers
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def trim_prefix(s: str, sub: str) -> str:
    """Return *s* with one leading occurrence of *sub* removed, if present.

    Uses str.removeprefix (Python 3.9+, which the package already requires
    for its builtin-generic annotations) instead of the manual
    startswith/slice pair.
    """
    return s.removeprefix(sub)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def trim_suffix(s: str, sub: str) -> str:
    """Return *s* without a trailing *sub*, or *s* unchanged when absent.

    Uses str.removesuffix (Python 3.9+). This also fixes a bug in the
    previous slicing version: with an empty *sub*, ``s[:-len(sub)]`` was
    ``s[:0]`` and the whole string was wrongly dropped.
    """
    return s.removesuffix(sub)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def truncate_string(s: str, length: int, replacer: str = "...") -> str:
    """Cut *s* to at most *length* characters, appending *replacer* when cut."""
    return s if len(s) <= length else s[:length] + replacer
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def unescape_backslash(s: str) -> str:
    """Interpret backslash escape sequences (\\n, \\t, \\x41, ...) in *s*.

    Round-trips through latin-1 with ``backslashreplace`` so non-ASCII
    characters survive: the previous UTF-8 encode + unicode_escape decode
    mangled anything outside ASCII (e.g. "é" came back as "Ã©"), because
    unicode_escape decodes bytes as latin-1.
    """
    return s.encode("latin-1", "backslashreplace").decode("unicode_escape")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def safe_int(v: Union[str, int, float], default: int = 0) -> int:
    """int(v), or *default* when the conversion fails for any reason."""
    try:
        result = int(v)
    except Exception:
        result = default
    return result
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def safe_float(v: Union[str, int, float], default: float = 0.0) -> float:
    """float(v), or *default* when the conversion fails for any reason."""
    try:
        result = float(v)
    except Exception:
        result = default
    return result
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Container helpers
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def first(seq: Iterable[T], default: T = None) -> T:
    """Return the first element of *seq*, or *default* when it is empty.

    Uses the built-in ``next(iter(seq), default)`` instead of
    ``cytoolz.first`` wrapped in a StopIteration handler -- identical
    behavior without the third-party dependency.
    """
    return next(iter(seq), default)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def chunkify(lst: list, size: int) -> Generator[list, None, None]:
    """Yield consecutive slices of *lst*, each at most *size* elements long."""
    for start in range(0, len(lst), size):
        end = start + size
        yield lst[start:end]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def extract_dict(d: dict[_KT, _VT], keys: Iterable[_KT]) -> dict[_KT, _VT]:
    """Return a new dict with only the entries of *d* whose key is in *keys*.

    *keys* is materialized into a set once, so each membership test is
    O(1) instead of O(len(keys)), and a one-shot iterator passed as
    *keys* now works correctly (the old per-entry ``in`` test would have
    consumed it). Insertion order of *d* is preserved.
    """
    wanted = set(keys)
    return {k: v for k, v in d.items() if k in wanted}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def ensure_list(v: Union[T, Iterable[T]]) -> list[T]:
    """Wrap *v* in a list, or copy it when it is already a list/tuple/set.

    Strings are treated as scalars (wrapped, not iterated).
    """
    return list(v) if isinstance(v, (tuple, set, list)) else [v]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def ensure_str_list(v: str, sep: str = ",", strip: bool = True) -> list[str]:
    """Coerce *v* into a list of strings.

    None and "" yield []. A string is split on *sep*, each piece
    whitespace-stripped unless *strip* is false. A tuple/set/list passes
    through as a plain list; any other type raises TypeError.
    """
    if v is None:
        return []
    if isinstance(v, str):
        if not v:
            return []
        parts = v.split(sep)
        return [part.strip() for part in parts] if strip else parts
    if isinstance(v, (tuple, set, list)):
        return list(v)
    raise TypeError(f'unsupported type "{type(v)}"')
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@overload
def replace_null_values(
    row: list[T],
    null_values: Union[list[T], set[T]],
    replacer: T = None,
) -> list[T]:
    ...


@overload
def replace_null_values(
    row: tuple[T, ...],
    null_values: Union[list[T], set[T]],
    replacer: T = None,
) -> tuple[T, ...]:
    ...


@overload
def replace_null_values(
    row: dict[_KT, _VT],
    null_values: Union[list[_VT], set[_VT]],
    replacer: _VT = None,
) -> dict[_KT, _VT]:
    ...


def replace_null_values(
    row: Union[list[_VT], tuple[_VT, ...], dict[_KT, _VT]],
    null_values: Union[list[_VT], set[_VT]],
    replacer: _VT = None,
) -> Union[list[_VT], tuple[_VT, ...], dict[_KT, _VT]]:
    """Replace every value of *row* found in *null_values* with *replacer*.

    Lists and tuples come back as the same container type; for dicts only
    the values are substituted, keys stay untouched. Any other container
    type raises TypeError.
    """

    def _swap(value):
        # Membership test against null_values decides substitution.
        return replacer if value in null_values else value

    if isinstance(row, list):
        return [_swap(item) for item in row]
    if isinstance(row, tuple):
        return tuple(_swap(item) for item in row)
    if isinstance(row, dict):
        return {key: _swap(value) for key, value in row.items()}
    raise TypeError(f"only list, tuple or dict type is supported, got {repr(type(row))}")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def get_env_id():
    """Read the current environment id from the ENV_ID_KEY env var as an int.

    Raises KeyError when the variable is unset and ValueError when its
    value is not an integer string.
    """
    raw = os.environ[ENV_ID_KEY]
    return int(raw)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def get_environment_variable(key: str, cast: Callable[[str], T] | None = None) -> T | None:
    """Fetch env var *key*: None when unset, else the raw string or cast(value)."""
    raw = os.environ.get(key)
    if raw is None:
        return None
    return cast(raw) if cast is not None else raw
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import shutil
|
|
5
|
+
import urllib.parse
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
import requests
|
|
10
|
+
import requests.adapters
|
|
11
|
+
from urllib3.util.retry import Retry
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def new_retry_session(
    max_retries=3,
    backoff_factor=0.3,
    method_whitelist=None,
    status_forcelist=(429, 500, 502, 503, 504),
    session=None,
):
    """Build (or augment) a requests.Session that retries failed calls.

    Retries cover connect/read errors and the listed HTTP status codes,
    with exponential backoff. Both http:// and https:// are mounted with
    a pooled adapter (100 connections / 100 per host).
    """
    if session is None:
        session = requests.Session()
    allowed = method_whitelist or Retry.DEFAULT_ALLOWED_METHODS
    retry_policy = Retry(
        total=max_retries,
        read=max_retries,
        connect=max_retries,
        allowed_methods=allowed,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = requests.adapters.HTTPAdapter(
        pool_connections=100,
        pool_maxsize=100,
        max_retries=retry_policy,
    )
    for prefix in ("http://", "https://"):
        session.mount(prefix, adapter)
    return session
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def download_file(url: str, filepath: str, **kwargs) -> str:
    """Stream *url* to disk and return the final file path.

    If *filepath* is an existing directory, the file name is taken from
    the last URL path segment. Extra **kwargs are forwarded to
    ``requests.get``.

    Fixes over the previous version:
    - ``raise_for_status()`` so a 4xx/5xx raises ``requests.HTTPError``
      instead of silently saving the error page as the file.
    - reads via ``iter_content`` so transport compression (gzip/deflate)
      is decoded; the old ``shutil.copyfileobj(r.raw, f)`` wrote the
      still-compressed wire bytes to disk.
    """
    if os.path.isdir(filepath):
        filename = url.split("/")[-1]
        filepath = os.path.join(filepath, filename)

    with requests.get(url, stream=True, **kwargs) as r:
        r.raise_for_status()
        with open(filepath, "wb") as f:
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

    return filepath
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
FQDN_RE = re.compile(r"^((?!-)[-A-Z\d]{1,62}(?<!-)\.)+[A-Z]{1,62}\.?$", re.IGNORECASE)


def is_valid_domain(host: str) -> bool:
    """True when *host* looks like a fully-qualified domain name.

    Total length is capped at 253 characters (RFC 1035 limit); each label
    is 1-62 characters and must not start or end with a hyphen, and the
    final label (TLD) is alphabetic. A single trailing dot is allowed.
    """
    return len(host) <= 253 and FQDN_RE.match(host) is not None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def fill_scheme_to_url(url: str, scheme="https") -> str:
    """Prepend *scheme* to *url* when it has none; otherwise return it as-is.

    When the URL has a path, the host portion must pass is_valid_domain()
    or the URL is returned unchanged.
    NOTE(review): a bare host without a path is not validated -- e.g.
    "foo" becomes "https://foo"; confirm that is intended.
    """
    p = urllib.parse.urlparse(url)
    # Already has a scheme -- nothing to do.
    if p.scheme != "":
        return url

    # Without a scheme, urlparse puts the whole "host/path" into .path
    # and leaves .netloc empty, so fall back to .path here.
    netloc = p.netloc or p.path
    if "/" in netloc:
        # Degenerate form like "://example.com/x": just prefix the scheme name.
        if netloc.startswith("://"):
            return f"{scheme}{url}"

        # Validate only the host portion before the first slash.
        domain = netloc[: netloc.index("/")]
        if not is_valid_domain(domain):
            return url
    # If .netloc was empty, .path was already consumed as netloc above.
    path = p.path if p.netloc else ""
    p = urllib.parse.ParseResult(scheme, netloc, path, *p[3:])
    return p.geturl()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def ensure_url_list(s: str, fix_scheme=True) -> Optional[list[str]]:
    """Parse *s* into a list of URLs, or None when *s* is falsy.

    Accepts either a JSON array string or a comma-separated string.
    With *fix_scheme* (the default), each URL is given an https://
    scheme when it lacks one.
    """
    if not s:
        return None

    try:
        urls = json.loads(s)
    except json.JSONDecodeError:
        # Not JSON -- fall back to comma-separated values.
        urls = s.split(",")

    if not fix_scheme:
        return urls
    return [fill_scheme_to_url(u) for u in urls]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
async def forward(_request, endpoint: str):
    """
    Relay a FastAPI-received request to *endpoint* and mirror the reply.

    Method, headers, query params and the raw body are copied onto an
    httpx request; the upstream status, headers and content come back as
    a FastAPI Response. Shared by several proxy routes.
    """
    from fastapi import Request
    from fastapi.responses import Response

    request: Request = _request
    body = await request.body()

    async with httpx.AsyncClient() as client:
        # Replay the incoming request against the target endpoint.
        upstream = await client.request(
            method=request.method,
            url=endpoint,
            headers=dict(request.headers),
            params=request.query_params,
            content=body,
        )

    # Hand the upstream answer back to our caller unchanged.
    return Response(
        content=upstream.content,
        status_code=upstream.status_code,
        headers=dict(upstream.headers),
    )
|