xparse-client 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xparse_client/__init__.py +4 -2
- {xparse_client-0.2.0.dist-info → xparse_client-0.2.2.dist-info}/METADATA +223 -125
- xparse_client-0.2.2.dist-info/RECORD +6 -0
- xparse_client-0.2.0.dist-info/RECORD +0 -6
- {xparse_client-0.2.0.dist-info → xparse_client-0.2.2.dist-info}/WHEEL +0 -0
- {xparse_client-0.2.0.dist-info → xparse_client-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {xparse_client-0.2.0.dist-info → xparse_client-0.2.2.dist-info}/top_level.txt +0 -0
xparse_client/__init__.py
CHANGED
|
@@ -6,7 +6,7 @@ logging.basicConfig(
|
|
|
6
6
|
encoding='utf-8'
|
|
7
7
|
)
|
|
8
8
|
|
|
9
|
-
from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, PipelineStats
|
|
9
|
+
from .pipeline.config import ParseConfig, ChunkConfig, EmbedConfig, Stage, PipelineStats, PipelineConfig
|
|
10
10
|
from .pipeline.sources import Source, S3Source, LocalSource, FtpSource, SmbSource
|
|
11
11
|
from .pipeline.destinations import Destination, MilvusDestination, LocalDestination, S3Destination
|
|
12
12
|
from .pipeline.pipeline import Pipeline, create_pipeline_from_config
|
|
@@ -15,7 +15,9 @@ __all__ = [
|
|
|
15
15
|
'ParseConfig',
|
|
16
16
|
'ChunkConfig',
|
|
17
17
|
'EmbedConfig',
|
|
18
|
+
'Stage',
|
|
18
19
|
'PipelineStats',
|
|
20
|
+
'PipelineConfig',
|
|
19
21
|
'Source',
|
|
20
22
|
'S3Source',
|
|
21
23
|
'LocalSource',
|
|
@@ -29,4 +31,4 @@ __all__ = [
|
|
|
29
31
|
'create_pipeline_from_config',
|
|
30
32
|
]
|
|
31
33
|
|
|
32
|
-
__version__ = '0.2.0'
|
|
34
|
+
__version__ = '0.2.2'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xparse-client
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: 面向Agent和RAG的新一代文档处理 AI Infra
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
|
|
@@ -69,25 +69,32 @@ pip install --upgrade xparse-client
|
|
|
69
69
|
|
|
70
70
|
#### 代码配置
|
|
71
71
|
```python
|
|
72
|
-
from xparse_client import ParseConfig, ChunkConfig, EmbedConfig, Pipeline, S3Source, MilvusDestination
|
|
73
|
-
|
|
74
|
-
#
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
72
|
+
from xparse_client import ParseConfig, ChunkConfig, EmbedConfig, Stage, Pipeline, S3Source, MilvusDestination
|
|
73
|
+
|
|
74
|
+
# 使用新的 stages 格式创建配置
|
|
75
|
+
stages = [
|
|
76
|
+
Stage(
|
|
77
|
+
type='parse',
|
|
78
|
+
config=ParseConfig(provider='textin')
|
|
79
|
+
),
|
|
80
|
+
Stage(
|
|
81
|
+
type='chunk',
|
|
82
|
+
config=ChunkConfig(
|
|
83
|
+
strategy='by_title',
|
|
84
|
+
include_orig_elements=False,
|
|
85
|
+
new_after_n_chars=512,
|
|
86
|
+
max_characters=1024,
|
|
87
|
+
overlap=50
|
|
88
|
+
)
|
|
89
|
+
),
|
|
90
|
+
Stage(
|
|
91
|
+
type='embed',
|
|
92
|
+
config=EmbedConfig(
|
|
93
|
+
provider='qwen',
|
|
94
|
+
model_name='text-embedding-v4'
|
|
95
|
+
)
|
|
96
|
+
)
|
|
97
|
+
]
|
|
91
98
|
|
|
92
99
|
# 创建 Pipeline
|
|
93
100
|
source = S3Source(...)
|
|
@@ -98,9 +105,7 @@ pipeline = Pipeline(
|
|
|
98
105
|
destination=destination,
|
|
99
106
|
api_base_url='https://api.textin.com/api/xparse',
|
|
100
107
|
api_headers={...},
|
|
101
|
-
|
|
102
|
-
chunk_config=chunk_config,
|
|
103
|
-
embed_config=embed_config
|
|
108
|
+
stages=stages
|
|
104
109
|
)
|
|
105
110
|
|
|
106
111
|
pipeline.run()
|
|
@@ -115,25 +120,32 @@ config = {
|
|
|
115
120
|
'api_base_url': 'https://api.textin.com/api/xparse',
|
|
116
121
|
'api_headers': {...},
|
|
117
122
|
|
|
118
|
-
#
|
|
119
|
-
'
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
123
|
+
# Stages 配置
|
|
124
|
+
'stages': [
|
|
125
|
+
{
|
|
126
|
+
'type': 'parse',
|
|
127
|
+
'config': {
|
|
128
|
+
'provider': 'textin' # 当前支持textin文档解析,未来可扩展
|
|
129
|
+
}
|
|
130
|
+
},
|
|
131
|
+
{
|
|
132
|
+
'type': 'chunk',
|
|
133
|
+
'config': {
|
|
134
|
+
'strategy': 'basic', # 分块策略: 'basic' | 'by_title' | 'by_page'
|
|
135
|
+
'include_orig_elements': False, # 是否包含原始元素
|
|
136
|
+
'new_after_n_chars': 512, # 多少字符后创建新块
|
|
137
|
+
'max_characters': 1024, # 最大字符数
|
|
138
|
+
'overlap': 0 # 重叠字符数
|
|
139
|
+
}
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
'type': 'embed',
|
|
143
|
+
'config': {
|
|
144
|
+
'provider': 'qwen', # 向量化供应商: 'qwen'/'doubao'
|
|
145
|
+
'model_name': 'text-embedding-v3' # 模型名称
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
]
|
|
137
149
|
}
|
|
138
150
|
|
|
139
151
|
# 使用配置创建 pipeline
|
|
@@ -426,8 +438,12 @@ Parse 参数中有必填项`Provider`,表示文档解析服务的供应商,
|
|
|
426
438
|
- textin: 合合信息提供的文档解析服务,在速度、准确性上均为行业领先
|
|
427
439
|
- 支持的文档解析参数参考 [TextIn 文档解析官方API文档](https://docs.textin.com/api-reference/endpoint/parse)
|
|
428
440
|
- 接口调用将按照 `TextIn 通用文档解析` 服务的计费标准进行计费
|
|
429
|
-
-
|
|
430
|
-
-
|
|
441
|
+
- textin-lite:
|
|
442
|
+
- 接口调用将按照 `TextIn 通用表格识别` 服务的计费标准进行计费
|
|
443
|
+
- mineru:
|
|
444
|
+
- 接口调用将按照 `TextIn 通用文档解析` 服务的计费标准进行计费
|
|
445
|
+
- paddle:
|
|
446
|
+
- 接口调用将按照 `TextIn 通用文档解析` 服务的计费标准进行计费
|
|
431
447
|
|
|
432
448
|
2. **Chunk Stage** (`type: "chunk"`)
|
|
433
449
|
|
|
@@ -473,21 +489,21 @@ Parse 参数中有必填项`Provider`,表示文档解析服务的供应商,
|
|
|
473
489
|
"stats": {
|
|
474
490
|
"original_elements": 10, // 原始解析的元素数量
|
|
475
491
|
"chunked_elements": 15, // 分块后的元素数量
|
|
476
|
-
"embedded_elements": 15,
|
|
477
|
-
"
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
492
|
+
"embedded_elements": 15, // 向量化后的元素数量
|
|
493
|
+
"stages": [
|
|
494
|
+
{
|
|
495
|
+
"type": "parse",
|
|
496
|
+
"config": {
|
|
497
|
+
"provider": "textin-lite"
|
|
498
|
+
}
|
|
499
|
+
},
|
|
500
|
+
{
|
|
501
|
+
"type": "chunk",
|
|
502
|
+
"config": {
|
|
503
|
+
"strategy": "by_title"
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
]
|
|
491
507
|
}
|
|
492
508
|
}
|
|
493
509
|
}
|
|
@@ -529,25 +545,32 @@ config = {
|
|
|
529
545
|
'x-ti-secret-code': 'your-secret-code'
|
|
530
546
|
},
|
|
531
547
|
|
|
532
|
-
#
|
|
533
|
-
'
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
548
|
+
# Stages 配置
|
|
549
|
+
'stages': [
|
|
550
|
+
{
|
|
551
|
+
'type': 'parse',
|
|
552
|
+
'config': {
|
|
553
|
+
'provider': 'textin'
|
|
554
|
+
}
|
|
555
|
+
},
|
|
556
|
+
{
|
|
557
|
+
'type': 'chunk',
|
|
558
|
+
'config': {
|
|
559
|
+
'strategy': 'by_title', # 按标题分块
|
|
560
|
+
'include_orig_elements': False,
|
|
561
|
+
'new_after_n_chars': 512,
|
|
562
|
+
'max_characters': 1024,
|
|
563
|
+
'overlap': 50 # 块之间重叠 50 字符
|
|
564
|
+
}
|
|
565
|
+
},
|
|
566
|
+
{
|
|
567
|
+
'type': 'embed',
|
|
568
|
+
'config': {
|
|
569
|
+
'provider': 'qwen',
|
|
570
|
+
'model_name': 'text-embedding-v3'
|
|
571
|
+
}
|
|
572
|
+
}
|
|
573
|
+
]
|
|
551
574
|
}
|
|
552
575
|
|
|
553
576
|
# 使用配置创建并运行 pipeline
|
|
@@ -572,15 +595,33 @@ config = {
|
|
|
572
595
|
'output_dir': './test_output'
|
|
573
596
|
},
|
|
574
597
|
'api_base_url': 'https://api.textin.com/api/xparse',
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
'
|
|
578
|
-
'max_characters': 1024
|
|
598
|
+
'api_headers': {
|
|
599
|
+
'x-ti-app-id': 'your-app-id',
|
|
600
|
+
'x-ti-secret-code': 'your-secret-code'
|
|
579
601
|
},
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
602
|
+
# Stages 配置
|
|
603
|
+
'stages': [
|
|
604
|
+
{
|
|
605
|
+
'type': 'parse',
|
|
606
|
+
'config': {
|
|
607
|
+
'provider': 'textin'
|
|
608
|
+
}
|
|
609
|
+
},
|
|
610
|
+
{
|
|
611
|
+
'type': 'chunk',
|
|
612
|
+
'config': {
|
|
613
|
+
'strategy': 'basic',
|
|
614
|
+
'max_characters': 1024
|
|
615
|
+
}
|
|
616
|
+
},
|
|
617
|
+
{
|
|
618
|
+
'type': 'embed',
|
|
619
|
+
'config': {
|
|
620
|
+
'provider': 'qwen',
|
|
621
|
+
'model_name': 'text-embedding-v3'
|
|
622
|
+
}
|
|
623
|
+
}
|
|
624
|
+
]
|
|
584
625
|
}
|
|
585
626
|
|
|
586
627
|
pipeline = create_pipeline_from_config(config)
|
|
@@ -598,14 +639,29 @@ config_by_page = {
|
|
|
598
639
|
'destination': {...},
|
|
599
640
|
'api_base_url': 'https://api.textin.com/api/xparse',
|
|
600
641
|
'api_headers': {...},
|
|
601
|
-
'
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
642
|
+
'stages': [
|
|
643
|
+
{
|
|
644
|
+
'type': 'parse',
|
|
645
|
+
'config': {
|
|
646
|
+
'provider': 'textin'
|
|
647
|
+
}
|
|
648
|
+
},
|
|
649
|
+
{
|
|
650
|
+
'type': 'chunk',
|
|
651
|
+
'config': {
|
|
652
|
+
'strategy': 'by_page', # 按页面分块
|
|
653
|
+
'max_characters': 2048, # 增大块大小
|
|
654
|
+
'overlap': 100 # 页面间重叠 100 字符
|
|
655
|
+
}
|
|
656
|
+
},
|
|
657
|
+
{
|
|
658
|
+
'type': 'embed',
|
|
659
|
+
'config': {
|
|
660
|
+
'provider': 'qwen',
|
|
661
|
+
'model_name': 'text-embedding-v4' # 使用更高精度的模型
|
|
662
|
+
}
|
|
663
|
+
}
|
|
664
|
+
]
|
|
609
665
|
}
|
|
610
666
|
|
|
611
667
|
# 配置 2:按标题分块(适合结构化文档)
|
|
@@ -614,15 +670,29 @@ config_by_title = {
|
|
|
614
670
|
'destination': {...},
|
|
615
671
|
'api_base_url': 'https://api.textin.com/api/xparse',
|
|
616
672
|
'api_headers': {...},
|
|
617
|
-
'
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
673
|
+
'stages': [
|
|
674
|
+
{
|
|
675
|
+
'type': 'parse',
|
|
676
|
+
'config': {
|
|
677
|
+
'provider': 'textin'
|
|
678
|
+
}
|
|
679
|
+
},
|
|
680
|
+
{
|
|
681
|
+
'type': 'chunk',
|
|
682
|
+
'config': {
|
|
683
|
+
'strategy': 'by_title', # 按标题分块
|
|
684
|
+
'include_orig_elements': True, # 保留原始元素信息
|
|
685
|
+
'max_characters': 1536
|
|
686
|
+
}
|
|
687
|
+
},
|
|
688
|
+
{
|
|
689
|
+
'type': 'embed',
|
|
690
|
+
'config': {
|
|
691
|
+
'provider': 'qwen',
|
|
692
|
+
'model_name': 'text-embedding-v3'
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
]
|
|
626
696
|
}
|
|
627
697
|
|
|
628
698
|
# 根据文档类型选择配置
|
|
@@ -659,15 +729,29 @@ config = {
|
|
|
659
729
|
'x-ti-secret-code': 'secret'
|
|
660
730
|
},
|
|
661
731
|
|
|
662
|
-
#
|
|
663
|
-
'
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
732
|
+
# Stages 配置
|
|
733
|
+
'stages': [
|
|
734
|
+
{
|
|
735
|
+
'type': 'parse',
|
|
736
|
+
'config': {
|
|
737
|
+
'provider': 'textin'
|
|
738
|
+
}
|
|
739
|
+
},
|
|
740
|
+
{
|
|
741
|
+
'type': 'chunk',
|
|
742
|
+
'config': {
|
|
743
|
+
'strategy': 'basic',
|
|
744
|
+
'max_characters': 1024
|
|
745
|
+
}
|
|
746
|
+
},
|
|
747
|
+
{
|
|
748
|
+
'type': 'embed',
|
|
749
|
+
'config': {
|
|
750
|
+
'provider': 'qwen',
|
|
751
|
+
'model_name': 'text-embedding-v3'
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
]
|
|
671
755
|
}
|
|
672
756
|
|
|
673
757
|
pipeline = create_pipeline_from_config(config)
|
|
@@ -690,14 +774,32 @@ config = {
|
|
|
690
774
|
'output_dir': './output'
|
|
691
775
|
},
|
|
692
776
|
'api_base_url': 'https://api.textin.com/api/xparse',
|
|
693
|
-
'
|
|
694
|
-
'
|
|
695
|
-
'
|
|
777
|
+
'api_headers': {
|
|
778
|
+
'x-ti-app-id': 'your-app-id',
|
|
779
|
+
'x-ti-secret-code': 'your-secret-code'
|
|
696
780
|
},
|
|
697
|
-
'
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
781
|
+
'stages': [
|
|
782
|
+
{
|
|
783
|
+
'type': 'parse',
|
|
784
|
+
'config': {
|
|
785
|
+
'provider': 'textin'
|
|
786
|
+
}
|
|
787
|
+
},
|
|
788
|
+
{
|
|
789
|
+
'type': 'chunk',
|
|
790
|
+
'config': {
|
|
791
|
+
'strategy': 'basic',
|
|
792
|
+
'max_characters': 1024
|
|
793
|
+
}
|
|
794
|
+
},
|
|
795
|
+
{
|
|
796
|
+
'type': 'embed',
|
|
797
|
+
'config': {
|
|
798
|
+
'provider': 'qwen',
|
|
799
|
+
'model_name': 'text-embedding-v3'
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
]
|
|
701
803
|
}
|
|
702
804
|
|
|
703
805
|
pipeline = create_pipeline_from_config(config)
|
|
@@ -712,9 +814,7 @@ if result:
|
|
|
712
814
|
print(f"原始元素: {stats.original_elements}")
|
|
713
815
|
print(f"分块后: {stats.chunked_elements}")
|
|
714
816
|
print(f"向量化: {stats.embedded_elements}")
|
|
715
|
-
print(f"
|
|
716
|
-
print(f" - 分块策略: {stats.chunk_config.strategy}")
|
|
717
|
-
print(f" - 向量模型: {stats.embed_config.model_name}")
|
|
817
|
+
print(f"执行的 stages: {[s.type for s in stats.stages]}")
|
|
718
818
|
|
|
719
819
|
# 写入目的地
|
|
720
820
|
metadata = {
|
|
@@ -733,9 +833,7 @@ Pipeline 接口会返回详细的处理统计信息:
|
|
|
733
833
|
| `original_elements` | int | 原始解析的元素数量 |
|
|
734
834
|
| `chunked_elements` | int | 分块后的元素数量 |
|
|
735
835
|
| `embedded_elements` | int | 向量化后的元素数量 |
|
|
736
|
-
| `
|
|
737
|
-
| `chunk_config` | ChunkConfig | 使用的分块配置 |
|
|
738
|
-
| `embed_config` | EmbedConfig | 使用的向量化配置 |
|
|
836
|
+
| `stages` | List[Stage] | 实际执行的 stages 配置 |
|
|
739
837
|
|
|
740
838
|
**示例输出:**
|
|
741
839
|
```
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
xparse_client/__init__.py,sha256=3c_nFCooim4J31P0QWPM2VqdFJiHNFdH44IRmIIHvjk,901
|
|
2
|
+
xparse_client-0.2.2.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
|
|
3
|
+
xparse_client-0.2.2.dist-info/METADATA,sha256=F390ItwP4PGN-9FvJGOgqjtX7YWgMJolSJnxqbEBq08,26508
|
|
4
|
+
xparse_client-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
5
|
+
xparse_client-0.2.2.dist-info/top_level.txt,sha256=W5PeQwOyfo_Od3d26-gcOtan7rHYk1q3SP1phYedat4,14
|
|
6
|
+
xparse_client-0.2.2.dist-info/RECORD,,
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
xparse_client/__init__.py,sha256=9xmthqT95qlxUT0i2KNwTR4ze4031aj9he1dEv6haHg,843
|
|
2
|
-
xparse_client-0.2.0.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
|
|
3
|
-
xparse_client-0.2.0.dist-info/METADATA,sha256=zwQomwSi4EatL0oBfpercGcI7IAinHuXvuuVpCkYrVY,24415
|
|
4
|
-
xparse_client-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
5
|
-
xparse_client-0.2.0.dist-info/top_level.txt,sha256=W5PeQwOyfo_Od3d26-gcOtan7rHYk1q3SP1phYedat4,14
|
|
6
|
-
xparse_client-0.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|