simtoolsz 0.2.2__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/PKG-INFO +1 -1
- simtoolsz-0.2.5/docs/special2db_usage.md +106 -0
- simtoolsz-0.2.5/examples/special2db_example.py +144 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/examples/zip2db_example.py +1 -1
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/pyproject.toml +1 -1
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/src/simtoolsz/__init__.py +4 -3
- simtoolsz-0.2.5/src/simtoolsz/db.py +373 -0
- simtoolsz-0.2.5/src/simtoolsz/reader.py +260 -0
- simtoolsz-0.2.5/test_optimized_reader.py +110 -0
- simtoolsz-0.2.5/tests/test_special2db.py +221 -0
- simtoolsz-0.2.5/tests/test_special2db_simple.py +157 -0
- simtoolsz-0.2.2/src/simtoolsz/io.py +0 -126
- simtoolsz-0.2.2/src/simtoolsz/youtube.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/.github/workflows/publish.yml +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/.gitignore +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/.python-version +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/LICENSE +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/README.md +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/README_EN.md +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/docs/DATETIME_CONVERSION.md +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/docs/mail_usage_guide.md +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/examples/conversion_examples.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/examples/mail_examples.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/examples/today_examples.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/requirements-dev.lock +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/requirements.lock +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/src/simtoolsz/datetime.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/src/simtoolsz/mail.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/src/simtoolsz/utils.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/test_conversion.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/test_iso_comprehensive.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/test_iso_format.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/test_simple.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/test_today_optimized.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/test_which_format.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/test_zip2db.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/test_zip2db_simple.py +0 -0
- {simtoolsz-0.2.2 → simtoolsz-0.2.5}/tests/verify_unicode_fix.py +0 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# special2db 函数使用指南
|
|
2
|
+
|
|
3
|
+
`special2db` 函数用于将特殊格式的数据文件(TSV、Avro、Arrow)转换为DuckDB数据库。
|
|
4
|
+
|
|
5
|
+
## 支持的文件格式
|
|
6
|
+
|
|
7
|
+
- **TSV**: 制表符分隔的文本文件
|
|
8
|
+
- **Avro**: Apache Avro格式文件
|
|
9
|
+
- **Arrow**: Apache Arrow格式文件
|
|
10
|
+
|
|
11
|
+
## 基本用法
|
|
12
|
+
|
|
13
|
+
### 读取单个TSV文件
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from simtoolsz.db import special2db
|
|
17
|
+
|
|
18
|
+
# 读取单个TSV文件到DuckDB
|
|
19
|
+
con = special2db('data/users.tsv', 'users.db')
|
|
20
|
+
|
|
21
|
+
# 查询数据
|
|
22
|
+
tables = con.execute("SHOW TABLES").fetchall()
|
|
23
|
+
count = con.execute("SELECT COUNT(*) FROM users").fetchone()[0]
|
|
24
|
+
print(f"记录数: {count}")
|
|
25
|
+
|
|
26
|
+
# 关闭连接
|
|
27
|
+
con.close()
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### 使用自定义表名
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
# 使用自定义表名
|
|
34
|
+
con = special2db('data/customers.tsv', 'customers.db', table='客户表')
|
|
35
|
+
|
|
36
|
+
# 查询自定义表名的数据
|
|
37
|
+
count = con.execute("SELECT COUNT(*) FROM 客户表").fetchone()[0]
|
|
38
|
+
print(f"客户表记录数: {count}")
|
|
39
|
+
con.close()
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### 处理目录中的多个文件
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
# 处理目录中的所有TSV文件
|
|
46
|
+
con = special2db('data_directory', 'all_data.db')
|
|
47
|
+
|
|
48
|
+
# 查看所有创建的表
|
|
49
|
+
tables = con.execute("SHOW TABLES").fetchall()
|
|
50
|
+
print(f"创建的表: {[table[0] for table in tables]}")
|
|
51
|
+
|
|
52
|
+
# 查询特定表
|
|
53
|
+
count = con.execute("SELECT COUNT(*) FROM users").fetchone()[0]
|
|
54
|
+
print(f"users表记录数: {count}")
|
|
55
|
+
con.close()
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### 指定读取参数
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
# 指定编码和其他参数
|
|
62
|
+
con = special2db('data/data.tsv', 'output.db',
|
|
63
|
+
encoding='utf-8',
|
|
64
|
+
header=True)
|
|
65
|
+
|
|
66
|
+
# 查询数据
|
|
67
|
+
tables = con.execute("SHOW TABLES").fetchall()
|
|
68
|
+
print(f"创建的表: {[table[0] for table in tables]}")
|
|
69
|
+
con.close()
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## 错误处理
|
|
73
|
+
|
|
74
|
+
函数会在以下情况下抛出 `ValueError`:
|
|
75
|
+
|
|
76
|
+
1. **找不到支持的数据文件**
|
|
77
|
+
```python
|
|
78
|
+
# 目录中没有TSV、Avro或Arrow文件
|
|
79
|
+
con = special2db('empty_directory', 'output.db')
|
|
80
|
+
# ValueError: 未找到支持的数据文件(tsv、avro、arrow)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
2. **不支持的文件格式**
|
|
84
|
+
```python
|
|
85
|
+
# 尝试读取不支持的文件格式
|
|
86
|
+
con = special2db('data/file.txt', 'output.db')
|
|
87
|
+
# ValueError: 不支持的文件格式: .txt。支持的格式: tsv, avro, arrow
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## 表名处理规则
|
|
91
|
+
|
|
92
|
+
1. **默认表名**: 使用文件名(不含扩展名)
|
|
93
|
+
2. **自定义表名**: 通过 `table` 参数指定(仅在只处理单个文件时生效;处理多个文件时该参数会被忽略,表名仍取自文件名)
|
|
94
|
+
3. **表名清理**: 自动移除特殊字符,只保留字母、数字和下划线
|
|
95
|
+
4. **多文件处理**: 每个文件创建一个表,表名基于文件名
|
|
96
|
+
|
|
97
|
+
## 注意事项
|
|
98
|
+
|
|
99
|
+
1. **数据库连接**: 函数返回DuckDB连接对象,使用完毕后需要手动关闭
|
|
100
|
+
2. **表覆盖**: 如果表已存在,会先删除再重新创建
|
|
101
|
+
3. **文件锁定**: 确保在使用完连接后关闭,避免文件锁定问题
|
|
102
|
+
4. **临时文件**: 示例中使用临时文件时,注意文件生命周期管理
|
|
103
|
+
|
|
104
|
+
## 完整示例
|
|
105
|
+
|
|
106
|
+
参考 `examples/special2db_example.py` 文件获取完整的使用示例。
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""
|
|
2
|
+
special2db 函数使用示例
|
|
3
|
+
|
|
4
|
+
这个示例演示如何使用 special2db 函数将特殊格式(TSV、Avro、Arrow)的数据文件转换为DuckDB数据库。
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import tempfile
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from simtoolsz.db import special2db
|
|
10
|
+
|
|
11
|
+
def create_sample_data():
    """Write a small users TSV file into a fresh temporary directory.

    Returns:
        A ``(directory, tsv_path)`` tuple of ``Path`` objects. The directory
        is created with ``tempfile.mkdtemp`` and is not removed here.
    """
    workdir = Path(tempfile.mkdtemp())

    # Header row followed by five user records, tab separated.
    rows = [
        "user_id\tname\tage\tcity\temail",
        "1\tAlice\t25\tNew York\talice@example.com",
        "2\tBob\t30\tLondon\tbob@example.com",
        "3\tCharlie\t35\tTokyo\tcharlie@example.com",
        "4\tDiana\t28\tParis\tdiana@example.com",
        "5\tEve\t32\tBerlin\teve@example.com",
    ]
    users_path = workdir / 'users.tsv'
    users_path.write_text("\n".join(rows), encoding='utf-8')

    print(f"示例数据已创建在: {workdir}")
    print(f"TSV文件: {users_path}")

    return workdir, users_path
|
|
30
|
+
|
|
31
|
+
def main():
    """Run four ``special2db`` demonstrations against generated sample data.

    The demos cover: loading a single TSV file, loading it under a custom
    table name, loading every TSV file found in a directory, and passing
    extra reader options. Each demo writes its database into its own
    temporary directory and closes the connection before that directory is
    removed. The sample-data directory created by ``create_sample_data`` is
    deleted once all demos have run (or one of them has failed).
    """
    # Local import: only needed here, to remove the mkdtemp() directory.
    import shutil

    print("=== special2db 函数使用示例 ===\n")

    tmpdir, tsv_file = create_sample_data()

    try:
        # Example 1: load a single TSV file.
        print("--- 示例1: 读取单个TSV文件 ---")
        with tempfile.TemporaryDirectory() as test_dir:
            test_dir = Path(test_dir)
            db_file = test_dir / 'users.db'

            con = special2db(tsv_file, db_file)

            try:
                # List the tables that were created.
                tables = con.execute("SHOW TABLES").fetchall()
                print(f"创建的表: {[table[0] for table in tables]}")

                if 'users' in [t[0] for t in tables]:
                    count = con.execute("SELECT COUNT(*) FROM users").fetchone()[0]
                    print(f"记录数: {count}")

                    # Show the table schema as (column name, type) pairs.
                    columns = con.execute("DESCRIBE users").fetchall()
                    print(f"列信息: {[(col[0], col[1]) for col in columns]}")

                    # Show the first three rows.
                    data = con.execute("SELECT * FROM users LIMIT 3").fetchall()
                    print("前3条数据:")
                    for row in data:
                        print(f" {row}")
            finally:
                # Close before the temporary directory is deleted so the
                # database file is no longer held open.
                con.close()

        # Example 2: use a custom table name.
        print("\n--- 示例2: 使用自定义表名 ---")
        with tempfile.TemporaryDirectory() as test_dir:
            test_dir = Path(test_dir)
            db_file = test_dir / 'custom_name.db'

            con = special2db(tsv_file, db_file, table='客户表')

            try:
                tables = con.execute("SHOW TABLES").fetchall()
                print(f"创建的表: {[table[0] for table in tables]}")

                if '客户表' in [t[0] for t in tables]:
                    count = con.execute("SELECT COUNT(*) FROM 客户表").fetchone()[0]
                    print(f"客户表记录数: {count}")

                    # Run a filtered query against the custom-named table.
                    data = con.execute("SELECT * FROM 客户表 WHERE age > 30").fetchall()
                    print(f"年龄大于30的客户: {data}")
            finally:
                con.close()

        # Example 3: process a whole directory.
        print("\n--- 示例3: 处理目录 ---")
        with tempfile.TemporaryDirectory() as test_dir:
            test_dir = Path(test_dir)

            # Create several TSV files inside the directory. The sample file
            # was written as UTF-8, so read it back with the same encoding.
            users_tsv = test_dir / 'users.tsv'
            users_tsv.write_text(tsv_file.read_text(encoding='utf-8'), encoding='utf-8')

            products_tsv = test_dir / 'products.tsv'
            products_content = "\n".join([
                "product_id\tname\tprice\tcategory\tstock",
                "101\t笔记本电脑\t8999\t电子产品\t50",
                "102\t无线鼠标\t199\t配件\t200",
                "103\tUSB硬盘\t599\t存储\t80",
                "104\t显示器\t1599\t电子产品\t30",
                "105\t键盘\t299\t配件\t150",
            ])
            products_tsv.write_text(products_content, encoding='utf-8')

            db_file = test_dir / 'directory.db'
            con = special2db(test_dir, db_file)

            try:
                tables = con.execute("SHOW TABLES").fetchall()
                print(f"创建的表: {[table[0] for table in tables]}")

                # Query each expected table, if it was created.
                created = [t[0] for t in tables]
                for table_name in ['users', 'products']:
                    if table_name in created:
                        count = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
                        print(f"{table_name}表记录数: {count}")
            finally:
                con.close()

        # Example 4: pass extra DuckDB reader options.
        print("\n--- 示例4: 使用额外的DuckDB参数 ---")
        with tempfile.TemporaryDirectory() as test_dir:
            test_dir = Path(test_dir)
            db_file = test_dir / 'with_params.db'

            # Specify the encoding and header handling for the TSV file.
            con = special2db(tsv_file, db_file,
                             encoding='utf-8',
                             header=True)

            try:
                tables = con.execute("SHOW TABLES").fetchall()
                print(f"创建的表: {[table[0] for table in tables]}")
            finally:
                con.close()

        print("\n=== 所有示例完成 ===")
        print("special2db 函数使用示例结束")
    finally:
        # create_sample_data() uses mkdtemp(), which is never cleaned up
        # automatically; remove it so repeated runs do not pile up temp dirs.
        shutil.rmtree(tmpdir, ignore_errors=True)
|
|
142
|
+
|
|
143
|
+
if __name__ == "__main__":
|
|
144
|
+
main()
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
import importlib.metadata
|
|
2
2
|
|
|
3
|
-
import simtoolsz.
|
|
3
|
+
import simtoolsz.db as db
|
|
4
4
|
import simtoolsz.mail as mail
|
|
5
5
|
import simtoolsz.utils as utils
|
|
6
6
|
import simtoolsz.datetime as datetime
|
|
7
|
+
import simtoolsz.reader as reader
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
try:
|
|
10
11
|
__version__ = importlib.metadata.version("simtoolsz")
|
|
11
12
|
except importlib.metadata.PackageNotFoundError:
|
|
12
|
-
__version__ = "0.2.
|
|
13
|
+
__version__ = "0.2.5"
|
|
13
14
|
|
|
14
15
|
__all__ = [
|
|
15
|
-
'__version__', 'mail', 'utils', 'datetime', '
|
|
16
|
+
'__version__', 'mail', 'utils', 'datetime', 'db', 'reader'
|
|
16
17
|
|
|
17
18
|
]
|
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
from typing import Optional, Dict, List, Union
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from tempfile import TemporaryDirectory
|
|
4
|
+
from zipfile import ZipFile
|
|
5
|
+
|
|
6
|
+
import duckdb
|
|
7
|
+
|
|
8
|
+
__all__ = [ 'zip2db', 'special2db', 'multizip2db' ]
|
|
9
|
+
|
|
10
|
+
def zip2db(zip_file: Path, db_file: Path,
           filename: Optional[str] = None,
           table: Optional[Union[Dict[str, str], List[str], str]] = None,
           **kwargs
           ) -> duckdb.DuckDBPyConnection :
    """
    Load csv, xlsx, parquet and json files from a zip archive into DuckDB.

    The archive is extracted into a temporary directory and every selected
    file is materialised as one table (an existing table of the same name is
    dropped first). A file that fails to load is reported with ``print`` and
    skipped; the remaining files are still processed.

    Args:
        zip_file: Path of the zip archive.
        db_file: Path of the DuckDB database file.
        filename: Name of a single archive member to load. When omitted,
            every supported file at the top level of the archive is loaded.
        table: Table name(s) to use. One of:
            - dict: ``{file name: table name}`` mapping; files missing from
              the mapping use their stem.
            - list: table names matched to the files by position; surplus
              files use their stem.
            - str: a single table name (only honoured when exactly one file
              is being loaded).
            Names are reduced to alphanumeric characters and underscores.
        **kwargs: Extra named options forwarded to the DuckDB reader
            function. Each value is passed as a SQL string literal.

    Returns:
        The open DuckDB connection. The caller is responsible for closing it.

    Raises:
        ValueError: If no supported data file is found in the archive.
    """
    # DuckDB table function used for each supported extension. The order is
    # significant: it fixes the file order that a ``table`` list is matched
    # against.
    # NOTE(review): st_read is presumably provided by DuckDB's spatial
    # extension - confirm it is available where .xlsx input is expected.
    readers = {
        '.csv': 'read_csv_auto',
        '.xlsx': 'st_read',
        '.parquet': 'read_parquet',
        '.json': 'read_json_auto',
    }

    def sql_str(value) -> str:
        """Render *value* as a single-quoted SQL string literal."""
        # Doubling embedded quotes keeps paths/values such as "o'brien.csv"
        # from terminating the literal early.
        return "'" + str(value).replace("'", "''") + "'"

    # Reader options do not depend on the file, so build them once.
    options = ''.join(f", {key}={sql_str(value)}" for key, value in kwargs.items())

    with TemporaryDirectory() as tmpdir:
        with ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(tmpdir)

        tmpdir_path = Path(tmpdir)

        # Collect the files to process.
        if filename:
            data_files = [tmpdir_path / filename]
        else:
            data_files = []
            for ext in readers:
                data_files.extend(tmpdir_path.glob(f'*{ext}'))

        if not data_files:
            raise ValueError("未找到支持的数据文件")

        # Connect only after validation so a failed call leaves nothing open.
        con = duckdb.connect(db_file)

        try:
            for i, data_file in enumerate(data_files):
                if not data_file.exists():
                    continue

                # Resolve the table name for this file.
                if isinstance(table, dict):
                    table_name = table.get(data_file.name) or data_file.stem
                elif isinstance(table, list):
                    table_name = table[i] if i < len(table) else data_file.stem
                elif isinstance(table, str) and len(data_files) == 1:
                    table_name = table
                else:
                    table_name = data_file.stem

                # Strip everything except alphanumerics and underscores, then
                # quote the identifier so names that start with a digit
                # (e.g. "2024_sales") are still valid SQL.
                table_name = ''.join(c for c in table_name if c.isalnum() or c == '_')
                quoted_table = f'"{table_name}"'

                reader = readers.get(data_file.suffix.lower())
                if reader is None:
                    continue

                try:
                    # Replace any existing table of the same name.
                    con.execute(f"DROP TABLE IF EXISTS {quoted_table}")
                    con.execute(
                        f"CREATE TABLE {quoted_table} AS "
                        f"SELECT * FROM {reader}({sql_str(data_file)}{options})"
                    )
                except Exception as e:
                    # Best effort: report the failure and keep going.
                    print(f"处理文件 {data_file.name} 时出错: {e}")
                    continue
        except Exception:
            # Anything escaping the per-file handler is unexpected; do not
            # leak the connection (and its file lock) to the caller.
            con.close()
            raise

    return con
|
|
127
|
+
|
|
128
|
+
def special2db(data_path: Path, db_path: Path,
               table: Optional[str] = None, **kwargs
               ) -> duckdb.DuckDBPyConnection :
    """
    Load TSV, Avro or Arrow files into a DuckDB database.

    Supported file formats (selected by file extension):
        - tsv: tab-separated text
        - avro: Apache Avro
        - arrow: Apache Arrow

    ``data_path`` may be a single file or a directory; for a directory every
    supported file directly inside it becomes one table. An existing table
    of the same name is dropped first. A file that fails to load is reported
    with ``print`` and skipped.

    Args:
        data_path: A data file, or a directory containing data files.
        db_path: Path of the DuckDB database file to write.
        table: Table name. Only honoured when exactly one file is loaded;
            otherwise each table is named after its file stem. Names are
            reduced to alphanumeric characters and underscores.
        **kwargs: Extra named options forwarded to the DuckDB reader
            function. Each value is passed as a SQL string literal.

    Returns:
        The open DuckDB connection. The caller is responsible for closing it.

    Raises:
        ValueError: If ``data_path`` is a file with an unsupported extension,
            or no supported file is found. The database is not opened or
            created in that case.

    Examples:
        >>> # Load a single TSV file
        >>> con = special2db('data/users.tsv', 'users.db')

        >>> # Use a custom table name
        >>> con = special2db('data/customers.tsv', 'customers.db', table='客户表')

        >>> # Load every supported file in a directory
        >>> con = special2db('data_directory', 'all_data.db')

        >>> # Pass an encoding and other reader options
        >>> con = special2db('data/data.tsv', 'output.db', encoding='utf-8', header=True)

        >>> # Query the data
        >>> tables = con.execute("SHOW TABLES").fetchall()
        >>> count = con.execute("SELECT COUNT(*) FROM users").fetchone()[0]
        >>> con.close()
    """
    # Extension -> (DuckDB table function, fixed leading options). The
    # backslash-t is sent to DuckDB verbatim as the two characters "\t".
    readers = {
        '.tsv': ('read_csv_auto', ", delim='\\t'"),
        '.avro': ('read_avro', ''),
        '.arrow': ('read_arrow', ''),
    }

    def sql_str(value) -> str:
        """Render *value* as a single-quoted SQL string literal."""
        # Doubling embedded quotes keeps paths/values containing "'" from
        # terminating the literal early.
        return "'" + str(value).replace("'", "''") + "'"

    data_path = Path(data_path)
    db_path = Path(db_path)

    # Validate the input BEFORE opening the database: connecting first would
    # leak an open connection and leave a stray database file behind
    # whenever one of the ValueErrors below is raised.
    if data_path.is_file():
        suffix = data_path.suffix.lower()
        if suffix not in readers:
            raise ValueError(f"不支持的文件格式: {suffix}。支持的格式: tsv, avro, arrow")
        data_files = [data_path]
    else:
        # Treat the path as a directory and collect every supported file.
        data_files = []
        for ext in readers:
            data_files.extend(data_path.glob(f'*{ext}'))

    if not data_files:
        raise ValueError("未找到支持的数据文件(tsv、avro、arrow)")

    # Reader options do not depend on the file, so build them once.
    options = ''.join(f", {key}={sql_str(value)}" for key, value in kwargs.items())

    con = duckdb.connect(db_path)

    try:
        for data_file in data_files:
            if not data_file.exists():
                continue

            # A custom name only applies when there is exactly one file.
            if table and len(data_files) == 1:
                table_name = table
            else:
                table_name = data_file.stem

            # Strip everything except alphanumerics and underscores, then
            # quote the identifier so names that start with a digit are
            # still valid SQL.
            table_name = ''.join(c for c in table_name if c.isalnum() or c == '_')
            quoted_table = f'"{table_name}"'

            reader = readers.get(data_file.suffix.lower())
            if reader is None:
                continue
            function, fixed_options = reader

            try:
                # Replace any existing table of the same name.
                con.execute(f"DROP TABLE IF EXISTS {quoted_table}")
                con.execute(
                    f"CREATE TABLE {quoted_table} AS "
                    f"SELECT * FROM {function}({sql_str(data_file)}{fixed_options}{options})"
                )
            except Exception as e:
                # Best effort: report the failure and keep going.
                print(f"处理文件 {data_file.name} 时出错: {e}")
                continue
    except Exception:
        # Anything escaping the per-file handler is unexpected; do not leak
        # the connection (and its file lock) to the caller.
        con.close()
        raise

    return con
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def multizip2db(ziplist:list[Path], filenames:str|list[str],
                db_path: Optional[Path] = None,
                table:Optional[str] = None, **kwargs
                ) -> duckdb.DuckDBPyConnection:
    """
    Merge matching files from several zip archives into DuckDB tables.

    Supported data formats (selected by file extension): tsv, csv, xlsx,
    parquet, json.

    Notes:
        1. Files that resolve to the same table name are appended to one
           table: the first file creates it, later files are inserted.
        2. If ``table`` is given, all data is merged into that table.
        3. Otherwise each file uses its stem as the table name.
        4. A file that fails to load is reported with ``print`` and skipped.

    Args:
        ziplist: Paths of the zip archives. Archives that do not exist are
            reported with ``print`` and skipped.
        filenames: File name or list of file names to load from each
            archive (glob patterns are supported).
        db_path: Path of the DuckDB database file; an in-memory database is
            used when omitted.
        table: Optional table name; defaults to each file's stem. Names are
            reduced to alphanumeric characters and underscores.
        kwargs: Extra named options forwarded to the DuckDB reader function.
            Each value is passed as a SQL string literal.

    Returns:
        The open DuckDB connection. The caller is responsible for closing it.

    Raises:
        Exception: Any error outside the per-file loading step (for example
            a corrupt archive) is printed and re-raised after the connection
            has been closed.
    """
    # Extension -> (DuckDB table function, fixed leading options). The
    # backslash-t is sent to DuckDB verbatim as the two characters "\t".
    # NOTE(review): st_read is presumably provided by DuckDB's spatial
    # extension - confirm it is available where .xlsx input is expected.
    readers = {
        '.csv': ('read_csv_auto', ''),
        '.xlsx': ('st_read', ''),
        '.parquet': ('read_parquet', ''),
        '.json': ('read_json_auto', ''),
        '.tsv': ('read_csv_auto', ", delim='\\t'"),
    }

    def sql_str(value) -> str:
        """Render *value* as a single-quoted SQL string literal."""
        # Doubling embedded quotes keeps paths/values containing "'" from
        # terminating the literal early.
        return "'" + str(value).replace("'", "''") + "'"

    # Normalise filenames to a list.
    if isinstance(filenames, str):
        filenames = [filenames]

    # Reader options do not depend on the file, so build them once.
    options = ''.join(f", {key}={sql_str(value)}" for key, value in kwargs.items())

    # Open the database (in memory when no path is given).
    if db_path is None:
        db_path = ":memory:"
    else:
        db_path = Path(db_path)
    con = duckdb.connect(db_path)

    try:
        for zip_path in ziplist:
            # Accept plain strings as well as Path objects.
            zip_path = Path(zip_path)
            if not zip_path.exists():
                print(f"压缩包不存在: {zip_path}")
                continue

            # Extract every archive into its own fresh directory so that
            # leftovers from a previous archive (files that failed to load
            # or were never matched) cannot be picked up again.
            with TemporaryDirectory() as tmpdir:
                tmpdir_path = Path(tmpdir)

                with ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(tmpdir_path)

                # A file matched by more than one pattern is loaded once.
                handled = set()

                for filename_pattern in filenames:
                    for data_file in tmpdir_path.glob(filename_pattern):
                        if data_file in handled or not data_file.is_file():
                            continue
                        handled.add(data_file)

                        table_name = table if table else data_file.stem

                        # Strip everything except alphanumerics and
                        # underscores, then quote the identifier so names
                        # that start with a digit are still valid SQL.
                        table_name = ''.join(c for c in table_name if c.isalnum() or c == '_')
                        quoted_table = f'"{table_name}"'

                        reader = readers.get(data_file.suffix.lower())
                        if reader is None:
                            continue
                        function, fixed_options = reader

                        read_query = (
                            f"SELECT * FROM {function}"
                            f"({sql_str(data_file)}{fixed_options}{options})"
                        )

                        try:
                            # DuckDB identifiers are case-insensitive, so
                            # compare existing names the same way.
                            existing_tables = con.execute("SHOW TABLES").fetchall()
                            table_exists = any(
                                table_name.lower() == t[0].lower()
                                for t in existing_tables
                            )

                            if table_exists:
                                # Append to the table created earlier.
                                con.execute(f"INSERT INTO {quoted_table} {read_query}")
                            else:
                                con.execute(f"CREATE TABLE {quoted_table} AS {read_query}")
                        except Exception as e:
                            # Best effort: report the failure and keep going.
                            print(f"处理文件 {data_file.name} 时出错: {e}")
                            continue

    except Exception as e:
        print(f"处理压缩包时出错: {e}")
        # The caller never receives the connection on this path, so close
        # it here instead of leaking it (and its file lock).
        con.close()
        raise

    return con
|
|
373
|
+
|