tamar-file-hub-client 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,353 +1,368 @@
1
- """
2
- Taple 服务基类
3
- """
4
- import csv
5
- import tempfile
6
- import os
7
- from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Union, Iterator, Callable, Tuple
9
- try:
10
- import openpyxl
11
- except ImportError:
12
- openpyxl = None
13
-
14
- from ...schemas.taple import (
15
- Table, Sheet, Column, Row, Cell, MergedCell, TableView,
16
- CellUpdate
17
- )
18
- from ...utils.converter import timestamp_to_datetime
19
- from .idempotent_taple_mixin import IdempotentTapleMixin
20
- from ...errors import ValidationError
21
-
22
-
23
- class BaseTapleService(IdempotentTapleMixin):
24
- """
25
- Taple 服务基类,提供通用的数据转换方法
26
- """
27
-
28
- def _convert_table(self, proto_table: Any) -> Table:
29
- """转换 Proto Table 为模型"""
30
- return Table(
31
- id=proto_table.id,
32
- file_id=proto_table.file_id,
33
- org_id=proto_table.org_id,
34
- user_id=proto_table.user_id,
35
- name=proto_table.name if proto_table.name else None,
36
- description=proto_table.description if proto_table.description else None,
37
- created_by_role=proto_table.created_by_role,
38
- created_by=proto_table.created_by,
39
- created_at=timestamp_to_datetime(proto_table.created_at),
40
- updated_at=timestamp_to_datetime(proto_table.updated_at),
41
- deleted_at=timestamp_to_datetime(proto_table.deleted_at) if proto_table.deleted_at else None,
42
- )
43
-
44
- def _convert_sheet(self, proto_sheet: Any) -> Sheet:
45
- """转换 Proto Sheet 为模型"""
46
- return Sheet(
47
- id=proto_sheet.id,
48
- table_id=proto_sheet.table_id,
49
- org_id=proto_sheet.org_id,
50
- user_id=proto_sheet.user_id,
51
- name=proto_sheet.name,
52
- description=proto_sheet.description if proto_sheet.description else None,
53
- position=proto_sheet.position,
54
- version=proto_sheet.version,
55
- created_by_role=proto_sheet.created_by_role,
56
- created_by=proto_sheet.created_by,
57
- created_at=timestamp_to_datetime(proto_sheet.created_at),
58
- updated_at=timestamp_to_datetime(proto_sheet.updated_at),
59
- deleted_at=timestamp_to_datetime(proto_sheet.deleted_at) if proto_sheet.deleted_at else None
60
- )
61
-
62
- def _convert_column(self, proto_column: Any) -> Column:
63
- """转换 Proto Column 为模型"""
64
- properties = None
65
- if proto_column.properties:
66
- from google.protobuf.json_format import MessageToDict
67
- properties = MessageToDict(proto_column.properties)
68
-
69
- return Column(
70
- id=proto_column.id,
71
- sheet_id=proto_column.sheet_id,
72
- org_id=proto_column.org_id,
73
- user_id=proto_column.user_id,
74
- column_key=proto_column.column_key,
75
- name=proto_column.name,
76
- column_type=proto_column.column_type,
77
- description=proto_column.description if proto_column.description else None,
78
- position=proto_column.position,
79
- width=proto_column.width if proto_column.width else None,
80
- hidden=proto_column.hidden if proto_column.hidden else None,
81
- properties=properties,
82
- version=proto_column.version,
83
- created_by_role=proto_column.created_by_role,
84
- created_by=proto_column.created_by,
85
- created_at=timestamp_to_datetime(proto_column.created_at),
86
- updated_at=timestamp_to_datetime(proto_column.updated_at),
87
- deleted_at=timestamp_to_datetime(proto_column.deleted_at) if proto_column.deleted_at else None
88
- )
89
-
90
- def _convert_row(self, proto_row: Any) -> Row:
91
- """转换 Proto Row 为模型"""
92
- return Row(
93
- id=proto_row.id,
94
- sheet_id=proto_row.sheet_id,
95
- org_id=proto_row.org_id,
96
- user_id=proto_row.user_id,
97
- row_key=proto_row.row_key,
98
- position=proto_row.position,
99
- height=proto_row.height if proto_row.height else None,
100
- hidden=proto_row.hidden if proto_row.hidden else None,
101
- version=proto_row.version,
102
- created_by_role=proto_row.created_by_role,
103
- created_by=proto_row.created_by,
104
- created_at=timestamp_to_datetime(proto_row.created_at),
105
- updated_at=timestamp_to_datetime(proto_row.updated_at),
106
- deleted_at=timestamp_to_datetime(proto_row.deleted_at) if proto_row.deleted_at else None
107
- )
108
-
109
- def _convert_cell(self, proto_cell: Any) -> Cell:
110
- """转换 Proto Cell 为模型"""
111
- styles = None
112
- if proto_cell.styles:
113
- from google.protobuf.json_format import MessageToDict
114
- styles = MessageToDict(proto_cell.styles)
115
-
116
- return Cell(
117
- id=proto_cell.id,
118
- sheet_id=proto_cell.sheet_id,
119
- column_id=proto_cell.column_id,
120
- row_id=proto_cell.row_id,
121
- org_id=proto_cell.org_id,
122
- user_id=proto_cell.user_id,
123
- column_key=proto_cell.column_key,
124
- row_key=proto_cell.row_key,
125
- raw_value=proto_cell.raw_value if proto_cell.raw_value else None,
126
- formatted_value=proto_cell.formatted_value if proto_cell.formatted_value else None,
127
- formula=proto_cell.formula if proto_cell.formula else None,
128
- styles=styles,
129
- data_type=proto_cell.data_type if proto_cell.data_type else None,
130
- version=proto_cell.version,
131
- created_by_role=proto_cell.created_by_role,
132
- created_by=proto_cell.created_by,
133
- created_at=timestamp_to_datetime(proto_cell.created_at),
134
- updated_at=timestamp_to_datetime(proto_cell.updated_at),
135
- deleted_at=timestamp_to_datetime(proto_cell.deleted_at) if proto_cell.deleted_at else None
136
- )
137
-
138
- def _convert_merged_cell(self, proto_merged_cell: Any) -> MergedCell:
139
- """转换 Proto MergedCell 为模型"""
140
- return MergedCell(
141
- id=proto_merged_cell.id,
142
- sheet_id=proto_merged_cell.sheet_id,
143
- org_id=proto_merged_cell.org_id,
144
- user_id=proto_merged_cell.user_id,
145
- start_column_id=proto_merged_cell.start_column_id,
146
- end_column_id=proto_merged_cell.end_column_id,
147
- start_row_id=proto_merged_cell.start_row_id,
148
- end_row_id=proto_merged_cell.end_row_id,
149
- created_at=timestamp_to_datetime(proto_merged_cell.created_at),
150
- updated_at=timestamp_to_datetime(proto_merged_cell.updated_at),
151
- deleted_at=timestamp_to_datetime(proto_merged_cell.deleted_at) if proto_merged_cell.deleted_at else None
152
- )
153
-
154
- def _convert_table_view(self, proto_view: Any) -> TableView:
155
- """转换 Proto TableView 为模型"""
156
- from google.protobuf.json_format import MessageToDict
157
- config = MessageToDict(proto_view.config)
158
-
159
- return TableView(
160
- id=proto_view.id,
161
- table_id=proto_view.table_id,
162
- sheet_id=proto_view.sheet_id,
163
- org_id=proto_view.org_id,
164
- user_id=proto_view.user_id,
165
- name=proto_view.name,
166
- view_type=proto_view.view_type,
167
- config=config,
168
- created_at=timestamp_to_datetime(proto_view.created_at),
169
- updated_at=timestamp_to_datetime(proto_view.updated_at),
170
- deleted_at=timestamp_to_datetime(proto_view.deleted_at) if proto_view.deleted_at else None
171
- )
172
-
173
- def _convert_dict_to_struct(self, data: Dict[str, Any]) -> Any:
174
- """转换字典为 Proto Struct"""
175
- from google.protobuf.struct_pb2 import Struct
176
- from google.protobuf.json_format import ParseDict
177
-
178
- struct = Struct()
179
- ParseDict(data, struct)
180
- return struct
181
-
182
- def _convert_cell_updates_to_proto(self, updates: List[CellUpdate]) -> List[Any]:
183
- """转换 CellUpdate 列表为 Proto 格式"""
184
- from ...rpc.gen import taple_service_pb2
185
-
186
- proto_updates = []
187
- for update in updates:
188
- proto_update = taple_service_pb2.CellUpdate(
189
- column_key=update.column_key,
190
- row_key=update.row_key
191
- )
192
-
193
- if update.raw_value is not None:
194
- proto_update.raw_value = update.raw_value
195
- if update.formula is not None:
196
- proto_update.formula = update.formula
197
- if update.styles is not None:
198
- proto_update.styles.CopyFrom(self._convert_dict_to_struct(update.styles))
199
-
200
- proto_updates.append(proto_update)
201
-
202
- return proto_updates
203
-
204
- def _is_file_id(self, source: Union[str, Path]) -> bool:
205
- """判断source是否为file_id"""
206
- if isinstance(source, Path):
207
- return False
208
- # file_id 通常是UUID格式或特定格式的ID
209
- source_str = str(source)
210
- # 检查是否是文件路径
211
- if '/' in source_str or '\\' in source_str or os.path.exists(source_str):
212
- return False
213
- # 简单判断:如果长度合适且不包含文件扩展名,可能是file_id
214
- return len(source_str) > 10 and not source_str.endswith(('.csv', '.xlsx', '.xls'))
215
-
216
- def _detect_file_format(self, file_path: Union[str, Path]) -> str:
217
- """自动检测文件格式"""
218
- path = Path(file_path)
219
- extension = path.suffix.lower()
220
- if extension in ['.csv', '.tsv']:
221
- return 'csv'
222
- elif extension in ['.xlsx', '.xls']:
223
- return 'excel'
224
- else:
225
- # 尝试通过内容判断
226
- with open(path, 'rb') as f:
227
- header = f.read(8)
228
- if header.startswith(b'PK'): # ZIP文件头,Excel文件
229
- return 'excel'
230
- return 'csv' # 默认CSV
231
-
232
- def _read_csv_stream(
233
- self,
234
- file_path: Union[str, Path],
235
- encoding: str = 'utf-8',
236
- batch_size: int = 1000,
237
- delimiter: str = ','
238
- ) -> Iterator[Dict[str, Any]]:
239
- """流式读取CSV文件"""
240
- with open(file_path, 'r', encoding=encoding, errors='replace') as f:
241
- # 尝试检测分隔符
242
- sample = f.read(1024)
243
- f.seek(0)
244
- sniffer = csv.Sniffer()
245
- try:
246
- detected_delimiter = sniffer.sniff(sample).delimiter
247
- delimiter = detected_delimiter
248
- except:
249
- pass # 使用默认分隔符
250
-
251
- reader = csv.DictReader(f, delimiter=delimiter)
252
- columns = reader.fieldnames
253
-
254
- if not columns:
255
- raise ValidationError("CSV文件没有列头")
256
-
257
- # 返回列定义
258
- yield {'type': 'columns', 'data': columns}
259
-
260
- # 批量返回行数据
261
- batch = []
262
- row_count = 0
263
- for row in reader:
264
- batch.append(row)
265
- row_count += 1
266
- if len(batch) >= batch_size:
267
- yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
268
- batch = []
269
-
270
- if batch:
271
- yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
272
-
273
- def _read_excel_stream(
274
- self,
275
- file_path: Union[str, Path],
276
- batch_size: int = 1000,
277
- sheet_mapping: Optional[Dict[str, str]] = None
278
- ) -> Iterator[Dict[str, Any]]:
279
- """流式读取Excel文件"""
280
- if openpyxl is None:
281
- raise ImportError("需要安装openpyxl库来处理Excel文件: pip install openpyxl")
282
-
283
- wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
284
-
285
- for sheet_name in wb.sheetnames:
286
- if sheet_mapping and sheet_name not in sheet_mapping:
287
- continue
288
-
289
- ws = wb[sheet_name]
290
- rows = ws.iter_rows(values_only=True)
291
-
292
- # 获取第一行作为列头
293
- headers = None
294
- for row in rows:
295
- if row and any(cell is not None for cell in row):
296
- headers = [str(cell) if cell is not None else f"Column_{i+1}"
297
- for i, cell in enumerate(row)]
298
- break
299
-
300
- if not headers:
301
- continue
302
-
303
- # 返回工作表信息
304
- target_sheet_name = sheet_mapping.get(sheet_name, sheet_name) if sheet_mapping else sheet_name
305
- yield {
306
- 'type': 'sheet',
307
- 'name': target_sheet_name,
308
- 'columns': headers,
309
- 'original_name': sheet_name
310
- }
311
-
312
- # 批量返回行数据
313
- batch = []
314
- row_count = 0
315
- for row in rows:
316
- if row and any(cell is not None for cell in row):
317
- row_dict = {}
318
- for i, (header, value) in enumerate(zip(headers, row)):
319
- if value is not None:
320
- row_dict[header] = value
321
- if row_dict: # 只添加非空行
322
- batch.append(row_dict)
323
- row_count += 1
324
- if len(batch) >= batch_size:
325
- yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
326
- batch = []
327
-
328
- if batch:
329
- yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
330
-
331
- wb.close()
332
-
333
- def _infer_column_type(self, values: List[Any]) -> str:
334
- """推断列类型"""
335
- if not values:
336
- return 'text'
337
-
338
- # 检查数值类型
339
- numeric_count = 0
340
- for value in values[:100]: # 只检查前100个值
341
- if value is None or str(value).strip() == '':
342
- continue
343
- try:
344
- float(str(value))
345
- numeric_count += 1
346
- except:
347
- pass
348
-
349
- if numeric_count > len(values) * 0.8: # 80%以上是数字
350
- return 'number'
351
-
352
- # 默认为文本
1
+ """
2
+ Taple 服务基类
3
+ """
4
+ import csv
5
+ import tempfile
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, Union, Iterator, Callable, Tuple
9
+ try:
10
+ import openpyxl
11
+ except ImportError:
12
+ openpyxl = None
13
+
14
+ from ...schemas.taple import (
15
+ Table, Sheet, Column, Row, Cell, MergedCell, TableView,
16
+ CellUpdate
17
+ )
18
+ from ...utils.converter import timestamp_to_datetime
19
+ from .idempotent_taple_mixin import IdempotentTapleMixin
20
+ from ...errors import ValidationError
21
+
22
+
23
+ class BaseTapleService(IdempotentTapleMixin):
24
+ """
25
+ Taple 服务基类,提供通用的数据转换方法
26
+ """
27
+
28
+ def _convert_table(self, proto_table: Any) -> Table:
29
+ """转换 Proto Table 为模型"""
30
+ return Table(
31
+ id=proto_table.id,
32
+ file_id=proto_table.file_id,
33
+ org_id=proto_table.org_id,
34
+ user_id=proto_table.user_id,
35
+ name=proto_table.name if proto_table.name else None,
36
+ description=proto_table.description if proto_table.description else None,
37
+ created_by_role=proto_table.created_by_role,
38
+ created_by=proto_table.created_by,
39
+ created_at=timestamp_to_datetime(proto_table.created_at),
40
+ updated_at=timestamp_to_datetime(proto_table.updated_at),
41
+ deleted_at=timestamp_to_datetime(proto_table.deleted_at) if proto_table.deleted_at else None,
42
+ )
43
+
44
+ def _convert_sheet(self, proto_sheet: Any) -> Sheet:
45
+ """转换 Proto Sheet 为模型"""
46
+ return Sheet(
47
+ id=proto_sheet.id,
48
+ table_id=proto_sheet.table_id,
49
+ org_id=proto_sheet.org_id,
50
+ user_id=proto_sheet.user_id,
51
+ name=proto_sheet.name,
52
+ description=proto_sheet.description if proto_sheet.description else None,
53
+ position=proto_sheet.position,
54
+ version=proto_sheet.version,
55
+ created_by_role=proto_sheet.created_by_role,
56
+ created_by=proto_sheet.created_by,
57
+ created_at=timestamp_to_datetime(proto_sheet.created_at),
58
+ updated_at=timestamp_to_datetime(proto_sheet.updated_at),
59
+ deleted_at=timestamp_to_datetime(proto_sheet.deleted_at) if proto_sheet.deleted_at else None
60
+ )
61
+
62
+ def _convert_column(self, proto_column: Any) -> Column:
63
+ """转换 Proto Column 为模型"""
64
+ properties = None
65
+ if proto_column.properties:
66
+ from google.protobuf.json_format import MessageToDict
67
+ properties = MessageToDict(proto_column.properties)
68
+
69
+ return Column(
70
+ id=proto_column.id,
71
+ sheet_id=proto_column.sheet_id,
72
+ org_id=proto_column.org_id,
73
+ user_id=proto_column.user_id,
74
+ column_key=proto_column.column_key,
75
+ name=proto_column.name,
76
+ column_type=proto_column.column_type,
77
+ description=proto_column.description if proto_column.description else None,
78
+ position=proto_column.position,
79
+ width=proto_column.width if proto_column.width else None,
80
+ hidden=proto_column.hidden if proto_column.hidden else None,
81
+ properties=properties,
82
+ version=proto_column.version,
83
+ created_by_role=proto_column.created_by_role,
84
+ created_by=proto_column.created_by,
85
+ created_at=timestamp_to_datetime(proto_column.created_at),
86
+ updated_at=timestamp_to_datetime(proto_column.updated_at),
87
+ deleted_at=timestamp_to_datetime(proto_column.deleted_at) if proto_column.deleted_at else None
88
+ )
89
+
90
+ def _convert_row(self, proto_row: Any) -> Row:
91
+ """转换 Proto Row 为模型"""
92
+ return Row(
93
+ id=proto_row.id,
94
+ sheet_id=proto_row.sheet_id,
95
+ org_id=proto_row.org_id,
96
+ user_id=proto_row.user_id,
97
+ row_key=proto_row.row_key,
98
+ position=proto_row.position,
99
+ height=proto_row.height if proto_row.height else None,
100
+ hidden=proto_row.hidden if proto_row.hidden else None,
101
+ version=proto_row.version,
102
+ created_by_role=proto_row.created_by_role,
103
+ created_by=proto_row.created_by,
104
+ created_at=timestamp_to_datetime(proto_row.created_at),
105
+ updated_at=timestamp_to_datetime(proto_row.updated_at),
106
+ deleted_at=timestamp_to_datetime(proto_row.deleted_at) if proto_row.deleted_at else None
107
+ )
108
+
109
+ def _convert_cell(self, proto_cell: Any) -> Cell:
110
+ """转换 Proto Cell 为模型"""
111
+ styles = None
112
+ if proto_cell.styles:
113
+ from google.protobuf.json_format import MessageToDict
114
+ styles = MessageToDict(proto_cell.styles)
115
+
116
+ return Cell(
117
+ id=proto_cell.id,
118
+ sheet_id=proto_cell.sheet_id,
119
+ column_id=proto_cell.column_id,
120
+ row_id=proto_cell.row_id,
121
+ org_id=proto_cell.org_id,
122
+ user_id=proto_cell.user_id,
123
+ column_key=proto_cell.column_key,
124
+ row_key=proto_cell.row_key,
125
+ raw_value=proto_cell.raw_value if proto_cell.raw_value else None,
126
+ formatted_value=proto_cell.formatted_value if proto_cell.formatted_value else None,
127
+ formula=proto_cell.formula if proto_cell.formula else None,
128
+ styles=styles,
129
+ data_type=proto_cell.data_type if proto_cell.data_type else None,
130
+ version=proto_cell.version,
131
+ created_by_role=proto_cell.created_by_role,
132
+ created_by=proto_cell.created_by,
133
+ created_at=timestamp_to_datetime(proto_cell.created_at),
134
+ updated_at=timestamp_to_datetime(proto_cell.updated_at),
135
+ deleted_at=timestamp_to_datetime(proto_cell.deleted_at) if proto_cell.deleted_at else None
136
+ )
137
+
138
+ def _convert_merged_cell(self, proto_merged_cell: Any) -> MergedCell:
139
+ """转换 Proto MergedCell 为模型"""
140
+ return MergedCell(
141
+ id=proto_merged_cell.id,
142
+ sheet_id=proto_merged_cell.sheet_id,
143
+ org_id=proto_merged_cell.org_id,
144
+ user_id=proto_merged_cell.user_id,
145
+ start_column_id=proto_merged_cell.start_column_id,
146
+ end_column_id=proto_merged_cell.end_column_id,
147
+ start_row_id=proto_merged_cell.start_row_id,
148
+ end_row_id=proto_merged_cell.end_row_id,
149
+ created_at=timestamp_to_datetime(proto_merged_cell.created_at),
150
+ updated_at=timestamp_to_datetime(proto_merged_cell.updated_at),
151
+ deleted_at=timestamp_to_datetime(proto_merged_cell.deleted_at) if proto_merged_cell.deleted_at else None
152
+ )
153
+
154
+ def _convert_table_view(self, proto_view: Any) -> TableView:
155
+ """转换 Proto TableView 为模型"""
156
+ from google.protobuf.json_format import MessageToDict
157
+
158
+ # 转换配置字段
159
+ config = MessageToDict(proto_view.config) if proto_view.HasField('config') else {}
160
+ filter_criteria = MessageToDict(proto_view.filter_criteria) if proto_view.HasField('filter_criteria') else None
161
+ sort_criteria = MessageToDict(proto_view.sort_criteria) if proto_view.HasField('sort_criteria') else None
162
+ visible_columns = MessageToDict(proto_view.visible_columns) if proto_view.HasField('visible_columns') else None
163
+ group_criteria = MessageToDict(proto_view.group_criteria) if proto_view.HasField('group_criteria') else None
164
+
165
+ return TableView(
166
+ id=proto_view.id,
167
+ table_id=proto_view.table_id,
168
+ sheet_id=proto_view.sheet_id,
169
+ org_id=proto_view.org_id,
170
+ user_id=proto_view.user_id,
171
+ file_id=proto_view.file_id,
172
+ filter_criteria=filter_criteria,
173
+ sort_criteria=sort_criteria,
174
+ visible_columns=visible_columns,
175
+ group_criteria=group_criteria,
176
+ created_by_role=proto_view.created_by_role,
177
+ created_by=proto_view.created_by,
178
+ view_name=proto_view.view_name,
179
+ view_type=proto_view.view_type,
180
+ is_hidden=proto_view.is_hidden,
181
+ is_default=proto_view.is_default,
182
+ config=config,
183
+ created_at=timestamp_to_datetime(proto_view.created_at),
184
+ updated_at=timestamp_to_datetime(proto_view.updated_at),
185
+ deleted_at=timestamp_to_datetime(proto_view.deleted_at) if proto_view.HasField('deleted_at') else None
186
+ )
187
+
188
+ def _convert_dict_to_struct(self, data: Dict[str, Any]) -> Any:
189
+ """转换字典为 Proto Struct"""
190
+ from google.protobuf.struct_pb2 import Struct
191
+ from google.protobuf.json_format import ParseDict
192
+
193
+ struct = Struct()
194
+ ParseDict(data, struct)
195
+ return struct
196
+
197
+ def _convert_cell_updates_to_proto(self, updates: List[CellUpdate]) -> List[Any]:
198
+ """转换 CellUpdate 列表为 Proto 格式"""
199
+ from ...rpc.gen import taple_service_pb2
200
+
201
+ proto_updates = []
202
+ for update in updates:
203
+ proto_update = taple_service_pb2.CellUpdate(
204
+ column_key=update.column_key,
205
+ row_key=update.row_key
206
+ )
207
+
208
+ if update.raw_value is not None:
209
+ proto_update.raw_value = update.raw_value
210
+ if update.formula is not None:
211
+ proto_update.formula = update.formula
212
+ if update.styles is not None:
213
+ proto_update.styles.CopyFrom(self._convert_dict_to_struct(update.styles))
214
+
215
+ proto_updates.append(proto_update)
216
+
217
+ return proto_updates
218
+
219
+ def _is_file_id(self, source: Union[str, Path]) -> bool:
220
+ """判断source是否为file_id"""
221
+ if isinstance(source, Path):
222
+ return False
223
+ # file_id 通常是UUID格式或特定格式的ID
224
+ source_str = str(source)
225
+ # 检查是否是文件路径
226
+ if '/' in source_str or '\\' in source_str or os.path.exists(source_str):
227
+ return False
228
+ # 简单判断:如果长度合适且不包含文件扩展名,可能是file_id
229
+ return len(source_str) > 10 and not source_str.endswith(('.csv', '.xlsx', '.xls'))
230
+
231
+ def _detect_file_format(self, file_path: Union[str, Path]) -> str:
232
+ """自动检测文件格式"""
233
+ path = Path(file_path)
234
+ extension = path.suffix.lower()
235
+ if extension in ['.csv', '.tsv']:
236
+ return 'csv'
237
+ elif extension in ['.xlsx', '.xls']:
238
+ return 'excel'
239
+ else:
240
+ # 尝试通过内容判断
241
+ with open(path, 'rb') as f:
242
+ header = f.read(8)
243
+ if header.startswith(b'PK'): # ZIP文件头,Excel文件
244
+ return 'excel'
245
+ return 'csv' # 默认CSV
246
+
247
+ def _read_csv_stream(
248
+ self,
249
+ file_path: Union[str, Path],
250
+ encoding: str = 'utf-8',
251
+ batch_size: int = 1000,
252
+ delimiter: str = ','
253
+ ) -> Iterator[Dict[str, Any]]:
254
+ """流式读取CSV文件"""
255
+ with open(file_path, 'r', encoding=encoding, errors='replace') as f:
256
+ # 尝试检测分隔符
257
+ sample = f.read(1024)
258
+ f.seek(0)
259
+ sniffer = csv.Sniffer()
260
+ try:
261
+ detected_delimiter = sniffer.sniff(sample).delimiter
262
+ delimiter = detected_delimiter
263
+ except:
264
+ pass # 使用默认分隔符
265
+
266
+ reader = csv.DictReader(f, delimiter=delimiter)
267
+ columns = reader.fieldnames
268
+
269
+ if not columns:
270
+ raise ValidationError("CSV文件没有列头")
271
+
272
+ # 返回列定义
273
+ yield {'type': 'columns', 'data': columns}
274
+
275
+ # 批量返回行数据
276
+ batch = []
277
+ row_count = 0
278
+ for row in reader:
279
+ batch.append(row)
280
+ row_count += 1
281
+ if len(batch) >= batch_size:
282
+ yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
283
+ batch = []
284
+
285
+ if batch:
286
+ yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
287
+
288
+ def _read_excel_stream(
289
+ self,
290
+ file_path: Union[str, Path],
291
+ batch_size: int = 1000,
292
+ sheet_mapping: Optional[Dict[str, str]] = None
293
+ ) -> Iterator[Dict[str, Any]]:
294
+ """流式读取Excel文件"""
295
+ if openpyxl is None:
296
+ raise ImportError("需要安装openpyxl库来处理Excel文件: pip install openpyxl")
297
+
298
+ wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
299
+
300
+ for sheet_name in wb.sheetnames:
301
+ if sheet_mapping and sheet_name not in sheet_mapping:
302
+ continue
303
+
304
+ ws = wb[sheet_name]
305
+ rows = ws.iter_rows(values_only=True)
306
+
307
+ # 获取第一行作为列头
308
+ headers = None
309
+ for row in rows:
310
+ if row and any(cell is not None for cell in row):
311
+ headers = [str(cell) if cell is not None else f"Column_{i+1}"
312
+ for i, cell in enumerate(row)]
313
+ break
314
+
315
+ if not headers:
316
+ continue
317
+
318
+ # 返回工作表信息
319
+ target_sheet_name = sheet_mapping.get(sheet_name, sheet_name) if sheet_mapping else sheet_name
320
+ yield {
321
+ 'type': 'sheet',
322
+ 'name': target_sheet_name,
323
+ 'columns': headers,
324
+ 'original_name': sheet_name
325
+ }
326
+
327
+ # 批量返回行数据
328
+ batch = []
329
+ row_count = 0
330
+ for row in rows:
331
+ if row and any(cell is not None for cell in row):
332
+ row_dict = {}
333
+ for i, (header, value) in enumerate(zip(headers, row)):
334
+ if value is not None:
335
+ row_dict[header] = value
336
+ if row_dict: # 只添加非空行
337
+ batch.append(row_dict)
338
+ row_count += 1
339
+ if len(batch) >= batch_size:
340
+ yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
341
+ batch = []
342
+
343
+ if batch:
344
+ yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
345
+
346
+ wb.close()
347
+
348
+ def _infer_column_type(self, values: List[Any]) -> str:
349
+ """推断列类型"""
350
+ if not values:
351
+ return 'text'
352
+
353
+ # 检查数值类型
354
+ numeric_count = 0
355
+ for value in values[:100]: # 只检查前100个值
356
+ if value is None or str(value).strip() == '':
357
+ continue
358
+ try:
359
+ float(str(value))
360
+ numeric_count += 1
361
+ except:
362
+ pass
363
+
364
+ if numeric_count > len(values) * 0.8: # 80%以上是数字
365
+ return 'number'
366
+
367
+ # 默认为文本
353
368
  return 'text'