tamar-file-hub-client 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,353 +1,380 @@
1
- """
2
- Taple 服务基类
3
- """
4
- import csv
5
- import tempfile
6
- import os
7
- from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Union, Iterator, Callable, Tuple
9
- try:
10
- import openpyxl
11
- except ImportError:
12
- openpyxl = None
13
-
14
- from ...schemas.taple import (
15
- Table, Sheet, Column, Row, Cell, MergedCell, TableView,
16
- CellUpdate
17
- )
18
- from ...utils.converter import timestamp_to_datetime
19
- from .idempotent_taple_mixin import IdempotentTapleMixin
20
- from ...errors import ValidationError
21
-
22
-
23
- class BaseTapleService(IdempotentTapleMixin):
24
- """
25
- Taple 服务基类,提供通用的数据转换方法
26
- """
27
-
28
- def _convert_table(self, proto_table: Any) -> Table:
29
- """转换 Proto Table 为模型"""
30
- return Table(
31
- id=proto_table.id,
32
- file_id=proto_table.file_id,
33
- org_id=proto_table.org_id,
34
- user_id=proto_table.user_id,
35
- name=proto_table.name if proto_table.name else None,
36
- description=proto_table.description if proto_table.description else None,
37
- created_by_role=proto_table.created_by_role,
38
- created_by=proto_table.created_by,
39
- created_at=timestamp_to_datetime(proto_table.created_at),
40
- updated_at=timestamp_to_datetime(proto_table.updated_at),
41
- deleted_at=timestamp_to_datetime(proto_table.deleted_at) if proto_table.deleted_at else None,
42
- )
43
-
44
- def _convert_sheet(self, proto_sheet: Any) -> Sheet:
45
- """转换 Proto Sheet 为模型"""
46
- return Sheet(
47
- id=proto_sheet.id,
48
- table_id=proto_sheet.table_id,
49
- org_id=proto_sheet.org_id,
50
- user_id=proto_sheet.user_id,
51
- name=proto_sheet.name,
52
- description=proto_sheet.description if proto_sheet.description else None,
53
- position=proto_sheet.position,
54
- version=proto_sheet.version,
55
- created_by_role=proto_sheet.created_by_role,
56
- created_by=proto_sheet.created_by,
57
- created_at=timestamp_to_datetime(proto_sheet.created_at),
58
- updated_at=timestamp_to_datetime(proto_sheet.updated_at),
59
- deleted_at=timestamp_to_datetime(proto_sheet.deleted_at) if proto_sheet.deleted_at else None
60
- )
61
-
62
- def _convert_column(self, proto_column: Any) -> Column:
63
- """转换 Proto Column 为模型"""
64
- properties = None
65
- if proto_column.properties:
66
- from google.protobuf.json_format import MessageToDict
67
- properties = MessageToDict(proto_column.properties)
68
-
69
- return Column(
70
- id=proto_column.id,
71
- sheet_id=proto_column.sheet_id,
72
- org_id=proto_column.org_id,
73
- user_id=proto_column.user_id,
74
- column_key=proto_column.column_key,
75
- name=proto_column.name,
76
- column_type=proto_column.column_type,
77
- description=proto_column.description if proto_column.description else None,
78
- position=proto_column.position,
79
- width=proto_column.width if proto_column.width else None,
80
- hidden=proto_column.hidden if proto_column.hidden else None,
81
- properties=properties,
82
- version=proto_column.version,
83
- created_by_role=proto_column.created_by_role,
84
- created_by=proto_column.created_by,
85
- created_at=timestamp_to_datetime(proto_column.created_at),
86
- updated_at=timestamp_to_datetime(proto_column.updated_at),
87
- deleted_at=timestamp_to_datetime(proto_column.deleted_at) if proto_column.deleted_at else None
88
- )
89
-
90
- def _convert_row(self, proto_row: Any) -> Row:
91
- """转换 Proto Row 为模型"""
92
- return Row(
93
- id=proto_row.id,
94
- sheet_id=proto_row.sheet_id,
95
- org_id=proto_row.org_id,
96
- user_id=proto_row.user_id,
97
- row_key=proto_row.row_key,
98
- position=proto_row.position,
99
- height=proto_row.height if proto_row.height else None,
100
- hidden=proto_row.hidden if proto_row.hidden else None,
101
- version=proto_row.version,
102
- created_by_role=proto_row.created_by_role,
103
- created_by=proto_row.created_by,
104
- created_at=timestamp_to_datetime(proto_row.created_at),
105
- updated_at=timestamp_to_datetime(proto_row.updated_at),
106
- deleted_at=timestamp_to_datetime(proto_row.deleted_at) if proto_row.deleted_at else None
107
- )
108
-
109
- def _convert_cell(self, proto_cell: Any) -> Cell:
110
- """转换 Proto Cell 为模型"""
111
- styles = None
112
- if proto_cell.styles:
113
- from google.protobuf.json_format import MessageToDict
114
- styles = MessageToDict(proto_cell.styles)
115
-
116
- return Cell(
117
- id=proto_cell.id,
118
- sheet_id=proto_cell.sheet_id,
119
- column_id=proto_cell.column_id,
120
- row_id=proto_cell.row_id,
121
- org_id=proto_cell.org_id,
122
- user_id=proto_cell.user_id,
123
- column_key=proto_cell.column_key,
124
- row_key=proto_cell.row_key,
125
- raw_value=proto_cell.raw_value if proto_cell.raw_value else None,
126
- formatted_value=proto_cell.formatted_value if proto_cell.formatted_value else None,
127
- formula=proto_cell.formula if proto_cell.formula else None,
128
- styles=styles,
129
- data_type=proto_cell.data_type if proto_cell.data_type else None,
130
- version=proto_cell.version,
131
- created_by_role=proto_cell.created_by_role,
132
- created_by=proto_cell.created_by,
133
- created_at=timestamp_to_datetime(proto_cell.created_at),
134
- updated_at=timestamp_to_datetime(proto_cell.updated_at),
135
- deleted_at=timestamp_to_datetime(proto_cell.deleted_at) if proto_cell.deleted_at else None
136
- )
137
-
138
- def _convert_merged_cell(self, proto_merged_cell: Any) -> MergedCell:
139
- """转换 Proto MergedCell 为模型"""
140
- return MergedCell(
141
- id=proto_merged_cell.id,
142
- sheet_id=proto_merged_cell.sheet_id,
143
- org_id=proto_merged_cell.org_id,
144
- user_id=proto_merged_cell.user_id,
145
- start_column_id=proto_merged_cell.start_column_id,
146
- end_column_id=proto_merged_cell.end_column_id,
147
- start_row_id=proto_merged_cell.start_row_id,
148
- end_row_id=proto_merged_cell.end_row_id,
149
- created_at=timestamp_to_datetime(proto_merged_cell.created_at),
150
- updated_at=timestamp_to_datetime(proto_merged_cell.updated_at),
151
- deleted_at=timestamp_to_datetime(proto_merged_cell.deleted_at) if proto_merged_cell.deleted_at else None
152
- )
153
-
154
- def _convert_table_view(self, proto_view: Any) -> TableView:
155
- """转换 Proto TableView 为模型"""
156
- from google.protobuf.json_format import MessageToDict
157
- config = MessageToDict(proto_view.config)
158
-
159
- return TableView(
160
- id=proto_view.id,
161
- table_id=proto_view.table_id,
162
- sheet_id=proto_view.sheet_id,
163
- org_id=proto_view.org_id,
164
- user_id=proto_view.user_id,
165
- name=proto_view.name,
166
- view_type=proto_view.view_type,
167
- config=config,
168
- created_at=timestamp_to_datetime(proto_view.created_at),
169
- updated_at=timestamp_to_datetime(proto_view.updated_at),
170
- deleted_at=timestamp_to_datetime(proto_view.deleted_at) if proto_view.deleted_at else None
171
- )
172
-
173
- def _convert_dict_to_struct(self, data: Dict[str, Any]) -> Any:
174
- """转换字典为 Proto Struct"""
175
- from google.protobuf.struct_pb2 import Struct
176
- from google.protobuf.json_format import ParseDict
177
-
178
- struct = Struct()
179
- ParseDict(data, struct)
180
- return struct
181
-
182
- def _convert_cell_updates_to_proto(self, updates: List[CellUpdate]) -> List[Any]:
183
- """转换 CellUpdate 列表为 Proto 格式"""
184
- from ...rpc.gen import taple_service_pb2
185
-
186
- proto_updates = []
187
- for update in updates:
188
- proto_update = taple_service_pb2.CellUpdate(
189
- column_key=update.column_key,
190
- row_key=update.row_key
191
- )
192
-
193
- if update.raw_value is not None:
194
- proto_update.raw_value = update.raw_value
195
- if update.formula is not None:
196
- proto_update.formula = update.formula
197
- if update.styles is not None:
198
- proto_update.styles.CopyFrom(self._convert_dict_to_struct(update.styles))
199
-
200
- proto_updates.append(proto_update)
201
-
202
- return proto_updates
203
-
204
- def _is_file_id(self, source: Union[str, Path]) -> bool:
205
- """判断source是否为file_id"""
206
- if isinstance(source, Path):
207
- return False
208
- # file_id 通常是UUID格式或特定格式的ID
209
- source_str = str(source)
210
- # 检查是否是文件路径
211
- if '/' in source_str or '\\' in source_str or os.path.exists(source_str):
212
- return False
213
- # 简单判断:如果长度合适且不包含文件扩展名,可能是file_id
214
- return len(source_str) > 10 and not source_str.endswith(('.csv', '.xlsx', '.xls'))
215
-
216
- def _detect_file_format(self, file_path: Union[str, Path]) -> str:
217
- """自动检测文件格式"""
218
- path = Path(file_path)
219
- extension = path.suffix.lower()
220
- if extension in ['.csv', '.tsv']:
221
- return 'csv'
222
- elif extension in ['.xlsx', '.xls']:
223
- return 'excel'
224
- else:
225
- # 尝试通过内容判断
226
- with open(path, 'rb') as f:
227
- header = f.read(8)
228
- if header.startswith(b'PK'): # ZIP文件头,Excel文件
229
- return 'excel'
230
- return 'csv' # 默认CSV
231
-
232
- def _read_csv_stream(
233
- self,
234
- file_path: Union[str, Path],
235
- encoding: str = 'utf-8',
236
- batch_size: int = 1000,
237
- delimiter: str = ','
238
- ) -> Iterator[Dict[str, Any]]:
239
- """流式读取CSV文件"""
240
- with open(file_path, 'r', encoding=encoding, errors='replace') as f:
241
- # 尝试检测分隔符
242
- sample = f.read(1024)
243
- f.seek(0)
244
- sniffer = csv.Sniffer()
245
- try:
246
- detected_delimiter = sniffer.sniff(sample).delimiter
247
- delimiter = detected_delimiter
248
- except:
249
- pass # 使用默认分隔符
250
-
251
- reader = csv.DictReader(f, delimiter=delimiter)
252
- columns = reader.fieldnames
253
-
254
- if not columns:
255
- raise ValidationError("CSV文件没有列头")
256
-
257
- # 返回列定义
258
- yield {'type': 'columns', 'data': columns}
259
-
260
- # 批量返回行数据
261
- batch = []
262
- row_count = 0
263
- for row in reader:
264
- batch.append(row)
265
- row_count += 1
266
- if len(batch) >= batch_size:
267
- yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
268
- batch = []
269
-
270
- if batch:
271
- yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
272
-
273
- def _read_excel_stream(
274
- self,
275
- file_path: Union[str, Path],
276
- batch_size: int = 1000,
277
- sheet_mapping: Optional[Dict[str, str]] = None
278
- ) -> Iterator[Dict[str, Any]]:
279
- """流式读取Excel文件"""
280
- if openpyxl is None:
281
- raise ImportError("需要安装openpyxl库来处理Excel文件: pip install openpyxl")
282
-
283
- wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
284
-
285
- for sheet_name in wb.sheetnames:
286
- if sheet_mapping and sheet_name not in sheet_mapping:
287
- continue
288
-
289
- ws = wb[sheet_name]
290
- rows = ws.iter_rows(values_only=True)
291
-
292
- # 获取第一行作为列头
293
- headers = None
294
- for row in rows:
295
- if row and any(cell is not None for cell in row):
296
- headers = [str(cell) if cell is not None else f"Column_{i+1}"
297
- for i, cell in enumerate(row)]
298
- break
299
-
300
- if not headers:
301
- continue
302
-
303
- # 返回工作表信息
304
- target_sheet_name = sheet_mapping.get(sheet_name, sheet_name) if sheet_mapping else sheet_name
305
- yield {
306
- 'type': 'sheet',
307
- 'name': target_sheet_name,
308
- 'columns': headers,
309
- 'original_name': sheet_name
310
- }
311
-
312
- # 批量返回行数据
313
- batch = []
314
- row_count = 0
315
- for row in rows:
316
- if row and any(cell is not None for cell in row):
317
- row_dict = {}
318
- for i, (header, value) in enumerate(zip(headers, row)):
319
- if value is not None:
320
- row_dict[header] = value
321
- if row_dict: # 只添加非空行
322
- batch.append(row_dict)
323
- row_count += 1
324
- if len(batch) >= batch_size:
325
- yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
326
- batch = []
327
-
328
- if batch:
329
- yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
330
-
331
- wb.close()
332
-
333
- def _infer_column_type(self, values: List[Any]) -> str:
334
- """推断列类型"""
335
- if not values:
336
- return 'text'
337
-
338
- # 检查数值类型
339
- numeric_count = 0
340
- for value in values[:100]: # 只检查前100个值
341
- if value is None or str(value).strip() == '':
342
- continue
343
- try:
344
- float(str(value))
345
- numeric_count += 1
346
- except:
347
- pass
348
-
349
- if numeric_count > len(values) * 0.8: # 80%以上是数字
350
- return 'number'
351
-
352
- # 默认为文本
1
+ """
2
+ Taple 服务基类
3
+ """
4
+ import csv
5
+ import tempfile
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, Union, Iterator, Callable, Tuple
9
+ try:
10
+ import openpyxl
11
+ except ImportError:
12
+ openpyxl = None
13
+
14
+ from ...schemas.taple import (
15
+ Table, Sheet, Column, Row, Cell, MergedCell, TableView,
16
+ CellUpdate
17
+ )
18
+ from ...utils.converter import timestamp_to_datetime
19
+ from .idempotent_taple_mixin import IdempotentTapleMixin
20
+ from ...errors import ValidationError
21
+
22
+
23
+ class BaseTapleService(IdempotentTapleMixin):
24
+ """
25
+ Taple 服务基类,提供通用的数据转换方法
26
+ """
27
+
28
def _convert_table(self, proto_table: Any) -> Table:
    """Build a ``Table`` model from a proto table message.

    Falsy ``name`` and ``description`` values are mapped to ``None``.
    ``deleted_at`` is converted only when the proto value is truthy;
    otherwise it is ``None``.
    """
    attrs = {
        'id': proto_table.id,
        'file_id': proto_table.file_id,
        'org_id': proto_table.org_id,
        'user_id': proto_table.user_id,
        'name': proto_table.name or None,
        'description': proto_table.description or None,
        'created_by_role': proto_table.created_by_role,
        'created_by': proto_table.created_by,
        'created_at': timestamp_to_datetime(proto_table.created_at),
        'updated_at': timestamp_to_datetime(proto_table.updated_at),
        'deleted_at': None,
    }
    if proto_table.deleted_at:
        attrs['deleted_at'] = timestamp_to_datetime(proto_table.deleted_at)
    return Table(**attrs)
43
+
44
def _convert_sheet(self, proto_sheet: Any) -> Sheet:
    """Build a ``Sheet`` model from a proto sheet message.

    A falsy ``description`` becomes ``None``. ``deleted_at`` is converted
    only when the proto value is truthy; otherwise it is ``None``.
    """
    attrs = {
        'id': proto_sheet.id,
        'table_id': proto_sheet.table_id,
        'org_id': proto_sheet.org_id,
        'user_id': proto_sheet.user_id,
        'name': proto_sheet.name,
        'description': proto_sheet.description or None,
        'position': proto_sheet.position,
        'version': proto_sheet.version,
        'created_by_role': proto_sheet.created_by_role,
        'created_by': proto_sheet.created_by,
        'created_at': timestamp_to_datetime(proto_sheet.created_at),
        'updated_at': timestamp_to_datetime(proto_sheet.updated_at),
        'deleted_at': None,
    }
    if proto_sheet.deleted_at:
        attrs['deleted_at'] = timestamp_to_datetime(proto_sheet.deleted_at)
    return Sheet(**attrs)
61
+
62
def _convert_column(self, proto_column: Any) -> Column:
    """Build a ``Column`` model from a proto column message.

    ``properties`` is converted to a plain dict with ``MessageToDict`` when
    the proto value is truthy, otherwise it is ``None``. Falsy
    ``description``, ``width`` and ``hidden`` values become ``None``, and
    ``deleted_at`` is converted only when the proto value is truthy.
    """
    raw_properties = proto_column.properties
    if raw_properties:
        # Imported lazily so protobuf's JSON helpers load only when needed.
        from google.protobuf.json_format import MessageToDict
        properties = MessageToDict(raw_properties)
    else:
        properties = None

    attrs = {
        'id': proto_column.id,
        'sheet_id': proto_column.sheet_id,
        'org_id': proto_column.org_id,
        'user_id': proto_column.user_id,
        'column_key': proto_column.column_key,
        'name': proto_column.name,
        'column_type': proto_column.column_type,
        'description': proto_column.description or None,
        'position': proto_column.position,
        'width': proto_column.width or None,
        'hidden': proto_column.hidden or None,
        'properties': properties,
        'version': proto_column.version,
        'created_by_role': proto_column.created_by_role,
        'created_by': proto_column.created_by,
        'created_at': timestamp_to_datetime(proto_column.created_at),
        'updated_at': timestamp_to_datetime(proto_column.updated_at),
        'deleted_at': None,
    }
    if proto_column.deleted_at:
        attrs['deleted_at'] = timestamp_to_datetime(proto_column.deleted_at)
    return Column(**attrs)
89
+
90
def _convert_row(self, proto_row: Any) -> Row:
    """Build a ``Row`` model from a proto row message.

    Falsy ``height`` and ``hidden`` values become ``None``. ``deleted_at``
    is converted only when the proto value is truthy; otherwise it is
    ``None``.
    """
    attrs = {
        'id': proto_row.id,
        'sheet_id': proto_row.sheet_id,
        'org_id': proto_row.org_id,
        'user_id': proto_row.user_id,
        'row_key': proto_row.row_key,
        'position': proto_row.position,
        'height': proto_row.height or None,
        'hidden': proto_row.hidden or None,
        'version': proto_row.version,
        'created_by_role': proto_row.created_by_role,
        'created_by': proto_row.created_by,
        'created_at': timestamp_to_datetime(proto_row.created_at),
        'updated_at': timestamp_to_datetime(proto_row.updated_at),
        'deleted_at': None,
    }
    if proto_row.deleted_at:
        attrs['deleted_at'] = timestamp_to_datetime(proto_row.deleted_at)
    return Row(**attrs)
108
+
109
def _convert_cell(self, proto_cell: Any) -> Cell:
    """Build a ``Cell`` model from a proto cell message.

    ``styles`` is converted to a plain dict with ``MessageToDict`` when the
    proto value is truthy, otherwise it is ``None``. Falsy ``raw_value``,
    ``formatted_value``, ``formula`` and ``data_type`` values become
    ``None``, and ``deleted_at`` is converted only when the proto value is
    truthy.
    """
    raw_styles = proto_cell.styles
    if raw_styles:
        # Imported lazily so protobuf's JSON helpers load only when needed.
        from google.protobuf.json_format import MessageToDict
        styles = MessageToDict(raw_styles)
    else:
        styles = None

    attrs = {
        'id': proto_cell.id,
        'sheet_id': proto_cell.sheet_id,
        'column_id': proto_cell.column_id,
        'row_id': proto_cell.row_id,
        'org_id': proto_cell.org_id,
        'user_id': proto_cell.user_id,
        'column_key': proto_cell.column_key,
        'row_key': proto_cell.row_key,
        'raw_value': proto_cell.raw_value or None,
        'formatted_value': proto_cell.formatted_value or None,
        'formula': proto_cell.formula or None,
        'styles': styles,
        'data_type': proto_cell.data_type or None,
        'version': proto_cell.version,
        'created_by_role': proto_cell.created_by_role,
        'created_by': proto_cell.created_by,
        'created_at': timestamp_to_datetime(proto_cell.created_at),
        'updated_at': timestamp_to_datetime(proto_cell.updated_at),
        'deleted_at': None,
    }
    if proto_cell.deleted_at:
        attrs['deleted_at'] = timestamp_to_datetime(proto_cell.deleted_at)
    return Cell(**attrs)
137
+
138
def _convert_merged_cell(self, proto_merged_cell: Any) -> MergedCell:
    """Build a ``MergedCell`` model from a proto merged-cell message.

    All identifier fields are copied as-is. ``deleted_at`` is converted
    only when the proto value is truthy; otherwise it is ``None``.
    """
    source = proto_merged_cell
    attrs = {
        'id': source.id,
        'sheet_id': source.sheet_id,
        'org_id': source.org_id,
        'user_id': source.user_id,
        'start_column_id': source.start_column_id,
        'end_column_id': source.end_column_id,
        'start_row_id': source.start_row_id,
        'end_row_id': source.end_row_id,
        'created_at': timestamp_to_datetime(source.created_at),
        'updated_at': timestamp_to_datetime(source.updated_at),
        'deleted_at': None,
    }
    if source.deleted_at:
        attrs['deleted_at'] = timestamp_to_datetime(source.deleted_at)
    return MergedCell(**attrs)
153
+
154
def _convert_table_view(self, proto_view: Any) -> TableView:
    """Build a ``TableView`` model from a proto table-view message.

    Message-typed fields (``config``, ``filter_criteria``, ``sort_criteria``,
    ``visible_columns``, ``group_criteria``) are converted with
    ``MessageToDict`` only when ``HasField`` reports them as set. An unset
    ``config`` becomes ``{}``; the other unset fields become ``None``.

    ``visible_columns`` may arrive in a legacy shape: a dict with an
    ``items`` list. In that case the list is turned into a dict mapping
    each entry to ``True``; any other shape is passed through unchanged.
    """
    from google.protobuf.json_format import MessageToDict

    def struct_field(field_name, default=None):
        # Convert a message field only when it is actually present.
        if proto_view.HasField(field_name):
            return MessageToDict(getattr(proto_view, field_name))
        return default

    config = struct_field('config', {})
    filter_criteria = struct_field('filter_criteria')
    sort_criteria = struct_field('sort_criteria')

    visible_columns = struct_field('visible_columns')
    if isinstance(visible_columns, dict):
        legacy_items = visible_columns.get('items')
        if isinstance(legacy_items, list):
            # Legacy format: every listed column is treated as visible.
            visible_columns = dict.fromkeys(legacy_items, True)

    group_criteria = struct_field('group_criteria')

    if proto_view.HasField('deleted_at'):
        deleted_at = timestamp_to_datetime(proto_view.deleted_at)
    else:
        deleted_at = None

    return TableView(
        id=proto_view.id,
        table_id=proto_view.table_id,
        sheet_id=proto_view.sheet_id,
        org_id=proto_view.org_id,
        user_id=proto_view.user_id,
        file_id=proto_view.file_id,
        filter_criteria=filter_criteria,
        sort_criteria=sort_criteria,
        visible_columns=visible_columns,
        group_criteria=group_criteria,
        created_by_role=proto_view.created_by_role,
        created_by=proto_view.created_by,
        view_name=proto_view.view_name,
        view_type=proto_view.view_type,
        is_hidden=proto_view.is_hidden,
        is_default=proto_view.is_default,
        config=config,
        created_at=timestamp_to_datetime(proto_view.created_at),
        updated_at=timestamp_to_datetime(proto_view.updated_at),
        deleted_at=deleted_at,
    )
199
+
200
def _convert_dict_to_struct(self, data: Dict[str, Any]) -> Any:
    """Convert a plain dict into a protobuf ``Struct`` message.

    Args:
        data: Dictionary to parse into the struct.

    Returns:
        A new ``Struct`` populated from ``data`` via ``ParseDict``.
    """
    from google.protobuf import json_format
    from google.protobuf import struct_pb2

    message = struct_pb2.Struct()
    json_format.ParseDict(data, message)
    return message
208
+
209
def _convert_cell_updates_to_proto(self, updates: List[CellUpdate]) -> List[Any]:
    """Convert ``CellUpdate`` models into proto ``CellUpdate`` messages.

    ``column_key`` and ``row_key`` are always set. ``raw_value``,
    ``formula`` and ``styles`` are set only when they are not ``None``;
    ``styles`` is converted to a proto struct first.

    Returns:
        One proto message per input update, in the same order.
    """
    from ...rpc.gen import taple_service_pb2

    def to_proto(item):
        message = taple_service_pb2.CellUpdate(
            column_key=item.column_key,
            row_key=item.row_key,
        )
        raw_value = item.raw_value
        if raw_value is not None:
            message.raw_value = raw_value
        formula = item.formula
        if formula is not None:
            message.formula = formula
        styles = item.styles
        if styles is not None:
            message.styles.CopyFrom(self._convert_dict_to_struct(styles))
        return message

    return [to_proto(item) for item in updates]
230
+
231
+ def _is_file_id(self, source: Union[str, Path]) -> bool:
232
+ """判断source是否为file_id"""
233
+ if isinstance(source, Path):
234
+ return False
235
+ # file_id 通常是UUID格式或特定格式的ID
236
+ source_str = str(source)
237
+ # 检查是否是文件路径
238
+ if '/' in source_str or '\\' in source_str or os.path.exists(source_str):
239
+ return False
240
+ # 简单判断:如果长度合适且不包含文件扩展名,可能是file_id
241
+ return len(source_str) > 10 and not source_str.endswith(('.csv', '.xlsx', '.xls'))
242
+
243
+ def _detect_file_format(self, file_path: Union[str, Path]) -> str:
244
+ """自动检测文件格式"""
245
+ path = Path(file_path)
246
+ extension = path.suffix.lower()
247
+ if extension in ['.csv', '.tsv']:
248
+ return 'csv'
249
+ elif extension in ['.xlsx', '.xls']:
250
+ return 'excel'
251
+ else:
252
+ # 尝试通过内容判断
253
+ with open(path, 'rb') as f:
254
+ header = f.read(8)
255
+ if header.startswith(b'PK'): # ZIP文件头,Excel文件
256
+ return 'excel'
257
+ return 'csv' # 默认CSV
258
+
259
+ def _read_csv_stream(
260
+ self,
261
+ file_path: Union[str, Path],
262
+ encoding: str = 'utf-8',
263
+ batch_size: int = 1000,
264
+ delimiter: str = ','
265
+ ) -> Iterator[Dict[str, Any]]:
266
+ """流式读取CSV文件"""
267
+ with open(file_path, 'r', encoding=encoding, errors='replace') as f:
268
+ # 尝试检测分隔符
269
+ sample = f.read(1024)
270
+ f.seek(0)
271
+ sniffer = csv.Sniffer()
272
+ try:
273
+ detected_delimiter = sniffer.sniff(sample).delimiter
274
+ delimiter = detected_delimiter
275
+ except:
276
+ pass # 使用默认分隔符
277
+
278
+ reader = csv.DictReader(f, delimiter=delimiter)
279
+ columns = reader.fieldnames
280
+
281
+ if not columns:
282
+ raise ValidationError("CSV文件没有列头")
283
+
284
+ # 返回列定义
285
+ yield {'type': 'columns', 'data': columns}
286
+
287
+ # 批量返回行数据
288
+ batch = []
289
+ row_count = 0
290
+ for row in reader:
291
+ batch.append(row)
292
+ row_count += 1
293
+ if len(batch) >= batch_size:
294
+ yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
295
+ batch = []
296
+
297
+ if batch:
298
+ yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
299
+
300
def _read_excel_stream(
    self,
    file_path: Union[str, Path],
    batch_size: int = 1000,
    sheet_mapping: Optional[Dict[str, str]] = None
) -> Iterator[Dict[str, Any]]:
    """Stream an Excel workbook as per-sheet header events and row batches.

    The workbook is opened with ``read_only=True`` and ``data_only=True``.
    For each sheet, the first row containing any non-``None`` cell is used
    as the header; ``None`` header cells are named ``Column_<n>`` (1-based).
    Sheets without such a row are skipped.

    Args:
        file_path: Path of the workbook to read.
        batch_size: Maximum number of rows per ``'rows'`` event.
        sheet_mapping: Optional mapping of source sheet name to target
            name. When given and non-empty, sheets not present in the
            mapping are skipped.

    Yields:
        For each processed sheet, a
        ``{'type': 'sheet', 'name', 'columns', 'original_name'}`` event,
        followed by ``{'type': 'rows', 'data', 'start_index'}`` events.
        Row dicts contain only non-``None`` cells; rows that end up empty
        are dropped. ``start_index`` counts emitted rows within the sheet.

    Raises:
        ImportError: If ``openpyxl`` is not installed.
    """
    if openpyxl is None:
        raise ImportError("需要安装openpyxl库来处理Excel文件: pip install openpyxl")

    wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
    try:
        for sheet_name in wb.sheetnames:
            if sheet_mapping and sheet_name not in sheet_mapping:
                continue

            ws = wb[sheet_name]
            rows = ws.iter_rows(values_only=True)

            # The first non-empty row provides the column headers.
            headers = None
            for row in rows:
                if row and any(cell is not None for cell in row):
                    headers = [str(cell) if cell is not None else f"Column_{i+1}"
                               for i, cell in enumerate(row)]
                    break

            if not headers:
                continue

            target_sheet_name = sheet_mapping.get(sheet_name, sheet_name) if sheet_mapping else sheet_name
            yield {
                'type': 'sheet',
                'name': target_sheet_name,
                'columns': headers,
                'original_name': sheet_name
            }

            # ``rows`` continues after the header row consumed above.
            batch = []
            row_count = 0
            for row in rows:
                if row and any(cell is not None for cell in row):
                    row_dict = {
                        header: value
                        for header, value in zip(headers, row)
                        if value is not None
                    }
                    if row_dict:  # keep only rows with at least one value
                        batch.append(row_dict)
                        row_count += 1
                        if len(batch) >= batch_size:
                            yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
                            batch = []

            # Flush the final, partially filled batch of this sheet.
            if batch:
                yield {'type': 'rows', 'data': batch, 'start_index': row_count - len(batch)}
    finally:
        # Close the workbook even if the consumer stops iterating early or
        # an error is raised while reading.
        wb.close()
359
+
360
+ def _infer_column_type(self, values: List[Any]) -> str:
361
+ """推断列类型"""
362
+ if not values:
363
+ return 'text'
364
+
365
+ # 检查数值类型
366
+ numeric_count = 0
367
+ for value in values[:100]: # 只检查前100个值
368
+ if value is None or str(value).strip() == '':
369
+ continue
370
+ try:
371
+ float(str(value))
372
+ numeric_count += 1
373
+ except:
374
+ pass
375
+
376
+ if numeric_count > len(values) * 0.8: # 80%以上是数字
377
+ return 'number'
378
+
379
+ # 默认为文本
353
380
  return 'text'