bella-openapi 1.0.2.4__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bella_openapi/__init__.py +2 -13
- bella_openapi/entity/standard_domtree.py +44 -33
- {bella_openapi-1.0.2.4.dist-info → bella_openapi-1.0.3.dist-info}/METADATA +1 -1
- {bella_openapi-1.0.2.4.dist-info → bella_openapi-1.0.3.dist-info}/RECORD +7 -7
- {bella_openapi-1.0.2.4.dist-info → bella_openapi-1.0.3.dist-info}/WHEEL +0 -0
- {bella_openapi-1.0.2.4.dist-info → bella_openapi-1.0.3.dist-info}/licenses/LICENSE +0 -0
- {bella_openapi-1.0.2.4.dist-info → bella_openapi-1.0.3.dist-info}/top_level.txt +0 -0
bella_openapi/__init__.py
CHANGED
@@ -3,19 +3,8 @@ from .log import operation_log, submit_log
|
|
3
3
|
from .openapi_contexvar import trace_id_context, caller_id_context, request_url_context
|
4
4
|
from .auth_billing import ErrorInfo, async_authenticate_decorator_args, authenticate_user, print_context, \
|
5
5
|
get_context, set_context, clean_context, report
|
6
|
-
from .entity import (
|
7
|
-
|
8
|
-
StandardNode,
|
9
|
-
SourceFile,
|
10
|
-
StandardPosition,
|
11
|
-
StandardImage,
|
12
|
-
Cell,
|
13
|
-
StandardRow,
|
14
|
-
StandardBaseElement,
|
15
|
-
StandardElement,
|
16
|
-
StandardTableElement,
|
17
|
-
StandardImageElement
|
18
|
-
)
|
6
|
+
from .entity import StandardDomTree, StandardNode, SourceFile, StandardPosition, StandardImage, Cell, \
|
7
|
+
StandardRow, StandardBaseElement, StandardElement, StandardTableElement, StandardImageElement
|
19
8
|
|
20
9
|
__all__ = ["validate_token", "operation_log",
|
21
10
|
"support_model",
|
@@ -31,16 +31,7 @@ layout_type_mapping = {
|
|
31
31
|
"TableNote": "Text", # 目前实际解析出来没有
|
32
32
|
}
|
33
33
|
|
34
|
-
|
35
|
-
def count_tokens(text: str, model: str = "gpt-4") -> int:
|
36
|
-
if not text:
|
37
|
-
return 0
|
38
|
-
encoding = tiktoken.encoding_for_model(model)
|
39
|
-
tokens = encoding.encode(text)
|
40
|
-
# 计算标记列表的长度,即标记的数量
|
41
|
-
token_count = len(tokens)
|
42
|
-
# 返回标记的数量
|
43
|
-
return token_count
|
34
|
+
|
44
35
|
|
45
36
|
class SourceFile(BaseModel):
|
46
37
|
id: str # 文件ID,唯一标识符,类型为string
|
@@ -315,18 +306,16 @@ class StandardDomTree(BaseModel):
|
|
315
306
|
# 检查前一个节点
|
316
307
|
if i > 0:
|
317
308
|
prev_sibling = node.children[i - 1]
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
merged = True
|
309
|
+
# 找到对应类型的前一个兄弟节点,合并节点
|
310
|
+
merged = ( prev_sibling.element and prev_sibling.element.type == target_type and
|
311
|
+
cls._merge_nodes(prev_sibling, current, target_type))
|
322
312
|
|
323
313
|
# 如果没有与前一个节点合并,检查后一个节点
|
324
314
|
if not merged and i < len(node.children) - 1:
|
325
315
|
next_sibling = node.children[i + 1]
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
merged = True
|
316
|
+
# 找到对应类型的后一个兄弟节点,合并节点
|
317
|
+
merged = (next_sibling.element and next_sibling.element.type == target_type and
|
318
|
+
cls._merge_nodes(next_sibling, current, target_type))
|
330
319
|
|
331
320
|
# 如果没有找到对应类型的兄弟节点,将当前节点类型改为 Text
|
332
321
|
if not merged:
|
@@ -357,23 +346,25 @@ class StandardDomTree(BaseModel):
|
|
357
346
|
Returns:
|
358
347
|
bool: 是否成功合并
|
359
348
|
"""
|
360
|
-
|
361
|
-
|
349
|
+
# 定义节点类型与元素类型的映射
|
350
|
+
type_element_mapping = {
|
351
|
+
'Figure': StandardImageElement,
|
352
|
+
'Table': StandardTableElement
|
353
|
+
}
|
354
|
+
|
355
|
+
can_merge = (node_type in type_element_mapping and
|
356
|
+
isinstance(target_node.element, type_element_mapping[node_type]))
|
357
|
+
|
358
|
+
# 检查节点类型是否支持且目标节点元素类型匹配
|
359
|
+
if can_merge:
|
360
|
+
# 将源节点的文本作为目标节点的 name
|
362
361
|
target_node.element.name = source_node.element.text
|
363
362
|
# 更新 tokens 计数
|
364
363
|
target_node.tokens += source_node.tokens
|
365
|
-
#
|
364
|
+
# 将源节点的位置添加到目标节点中
|
366
365
|
target_node.element.positions += source_node.element.positions
|
367
|
-
|
368
|
-
|
369
|
-
# 将 TableName 的文本作为 Table 的 name
|
370
|
-
target_node.element.name = source_node.element.text
|
371
|
-
# 更新 tokens 计数
|
372
|
-
target_node.tokens += source_node.tokens
|
373
|
-
# 将 Table 的位置添加到 Figure 中
|
374
|
-
target_node.element.positions += source_node.element.positions
|
375
|
-
return True
|
376
|
-
return False
|
366
|
+
|
367
|
+
return can_merge
|
377
368
|
|
378
369
|
@classmethod
|
379
370
|
def _from_domtree_node_to_base_info(cls, node: dict) -> Optional[StandardNode]:
|
@@ -440,7 +431,7 @@ class StandardDomTree(BaseModel):
|
|
440
431
|
cell_data['end_col']],
|
441
432
|
text=cell_text,
|
442
433
|
# 目前只会有一个元素,且是Text类型,Path重新从头编号,相对cell是root
|
443
|
-
nodes=[StandardNode(summary="", tokens=count_tokens(cell_text), path=[1], children=[],
|
434
|
+
nodes=[StandardNode(summary="", tokens=cls.count_tokens(cell_text), path=[1], children=[],
|
444
435
|
element=StandardElement(
|
445
436
|
type='Text',
|
446
437
|
positions=[],
|
@@ -491,7 +482,7 @@ class StandardDomTree(BaseModel):
|
|
491
482
|
standard_node.children.append(standard_child)
|
492
483
|
|
493
484
|
# 计算 token 数量:自身 text 的 token 数量 + 子节点 token 数量
|
494
|
-
tokens = count_tokens(text)
|
485
|
+
tokens = cls.count_tokens(text)
|
495
486
|
for child in standard_node.children:
|
496
487
|
tokens += child.tokens
|
497
488
|
|
@@ -500,3 +491,23 @@ class StandardDomTree(BaseModel):
|
|
500
491
|
|
501
492
|
return standard_node
|
502
493
|
|
494
|
+
@classmethod
|
495
|
+
def count_tokens(cls, text: str) -> int:
|
496
|
+
"""
|
497
|
+
计算文本的token数量
|
498
|
+
|
499
|
+
Args:
|
500
|
+
text: 要计算的文本
|
501
|
+
|
502
|
+
Returns:
|
503
|
+
int: token数量
|
504
|
+
"""
|
505
|
+
model = "gpt-4" # 使用模型默认为gpt-4
|
506
|
+
if not text:
|
507
|
+
return 0
|
508
|
+
encoding = tiktoken.encoding_for_model(model)
|
509
|
+
tokens = encoding.encode(text)
|
510
|
+
# 计算标记列表的长度,即标记的数量
|
511
|
+
token_count = len(tokens)
|
512
|
+
# 返回标记的数量
|
513
|
+
return token_count
|
@@ -1,4 +1,4 @@
|
|
1
|
-
bella_openapi/__init__.py,sha256=
|
1
|
+
bella_openapi/__init__.py,sha256=RBVZURWQwmb8dN7S6cgDej29bcg_EiHhCmpx1qQMgnw,1420
|
2
2
|
bella_openapi/auth_billing.py,sha256=Hn0KS8GuG48etnvnd1Faej4IfFXD3tjzalUzDnpZh7Q,3520
|
3
3
|
bella_openapi/authorize.py,sha256=cO6J-wx9dmmkDAeqpXT7QlyCr13hO-HSC5SWQSw2gZw,2150
|
4
4
|
bella_openapi/config.py,sha256=Dn8vnToDaOesPGboauxCCwNrW5awQLeSkmDjNjXS4bQ,319
|
@@ -14,11 +14,11 @@ bella_openapi/bella_trace/trace_requests.py,sha256=ADA8J_gbC3TwUo5LWQ3c_yTmCSZRa
|
|
14
14
|
bella_openapi/console/__init__.py,sha256=uSfr5v6JLRSqTlftjK_ZU1pnbkEyxAPbuQbMyYX_phk,64
|
15
15
|
bella_openapi/console/models.py,sha256=Hh1UuYHIxFtF9r5QK-pSJPFrSqbZUHv6spLvPbCeX08,1274
|
16
16
|
bella_openapi/entity/__init__.py,sha256=zzsYYg859pzPSgx1Py2kxB2ozQ0tt4OtTatBtpm2bAw,512
|
17
|
-
bella_openapi/entity/standard_domtree.py,sha256=
|
17
|
+
bella_openapi/entity/standard_domtree.py,sha256=oN2z_t5uTzO01szVSEnsN_GWYtxKkgvQPgJuOnOOaFs,20270
|
18
18
|
bella_openapi/middleware/__init__.py,sha256=XWvZG1xO30ZXIn10YVYthmT1BV-9fonMEP_jVRZbAlQ,157
|
19
19
|
bella_openapi/middleware/context_middleware.py,sha256=YawQyKAxMzvlDs_MxcuQKh90pP6VoMKzCBDS94qmlzQ,3870
|
20
|
-
bella_openapi-1.0.
|
21
|
-
bella_openapi-1.0.
|
22
|
-
bella_openapi-1.0.
|
23
|
-
bella_openapi-1.0.
|
24
|
-
bella_openapi-1.0.
|
20
|
+
bella_openapi-1.0.3.dist-info/licenses/LICENSE,sha256=O-0zMbcEi6wXz1DiSdVgzMlQjJcNqNe5KDv08uYzqR0,1055
|
21
|
+
bella_openapi-1.0.3.dist-info/METADATA,sha256=DauqWLJgwkWy0eSl-l_HISwuvEQ1t3NScpBmkZ9qa6c,9375
|
22
|
+
bella_openapi-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
23
|
+
bella_openapi-1.0.3.dist-info/top_level.txt,sha256=EZuq3F6tKeF-vmZQi6_S2XzmES7SPW7HAbGN1Uv9vN8,14
|
24
|
+
bella_openapi-1.0.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|