xparse-client 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
example/run_pipeline.py CHANGED
@@ -5,6 +5,7 @@ Pipeline 运行脚本
5
5
  快速启动和运行 Pipeline 的示例
6
6
  '''
7
7
 
8
+ import json
8
9
  from datetime import datetime, timezone
9
10
  from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
10
11
 
@@ -175,7 +176,7 @@ def run_with_manual_setup():
175
176
 
176
177
  destination = MilvusDestination(
177
178
  db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
178
- collection_name='textin_test_3', # 数据库collection名称
179
+ collection_name='textin_test_3_copy', # 数据库collection名称
179
180
  dimension=1024, # 向量维度,需与 embed API 返回一致
180
181
  api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
181
182
  )
@@ -235,7 +236,8 @@ def run_with_manual_setup():
235
236
  )
236
237
 
237
238
  # 运行
238
- pipeline.run()
239
+ config = pipeline.get_config()
240
+ print(json.dumps(config, ensure_ascii=False, indent=2))
239
241
 
240
242
 
241
243
  # ============================================================================
@@ -79,6 +79,139 @@ class Pipeline:
79
79
  print(f" Pipeline Config: 中间结果保存已启用")
80
80
  print("=" * 60)
81
81
 
82
def get_config(self) -> Dict[str, Any]:
    """Return this pipeline's configuration as a plain dictionary.

    The layout follows the dictionary accepted by
    ``create_pipeline_from_config``: the keys are ``source``,
    ``destination``, ``api_base_url``, ``api_headers``, ``stages`` and,
    only when intermediate results are enabled, ``pipeline_config``.

    Credentials that are not kept on the source/destination objects
    (access_key, secret_key, password, api_key, token) cannot be
    recovered and are therefore absent from the result. The S3 region is
    not kept either, so ``'us-east-1'`` is reported as a default.

    NOTE: ``api_headers`` is copied verbatim. If the headers carry
    credentials (for example ``x-ti-secret-code``), they will be part of
    the returned dictionary; redact them before printing or persisting.

    Returns:
        A new dictionary. Mutating it (including dict-typed stage
        configs and the headers mapping, both shallow-copied) does not
        affect this pipeline.
    """
    config: Dict[str, Any] = {}

    # --- Source -----------------------------------------------------------
    source = self.source
    # 'S3Source' -> 's3', 'LocalSource' -> 'local', ...
    source_cfg: Dict[str, Any] = {
        'type': type(source).__name__.replace('Source', '').lower()
    }
    if isinstance(source, S3Source):
        source_cfg.update({
            'endpoint': source.endpoint,
            'bucket': source.bucket,
            'prefix': source.prefix,
            'pattern': source.pattern,
            'recursive': source.recursive,
        })
        # access_key / secret_key are not kept on the object and cannot be
        # recovered; region is not kept either, so report the default.
        source_cfg['region'] = 'us-east-1'
    elif isinstance(source, LocalSource):
        source_cfg.update({
            'directory': str(source.directory),
            'pattern': source.pattern,
            'recursive': source.recursive,
        })
    elif isinstance(source, FtpSource):
        # password is not kept on the object and cannot be recovered.
        source_cfg.update({
            'host': source.host,
            'port': source.port,
            'username': source.username,
            'pattern': source.pattern,
            'recursive': source.recursive,
        })
    elif isinstance(source, SmbSource):
        # password is not kept on the object and cannot be recovered.
        source_cfg.update({
            'host': source.host,
            'share_name': source.share_name,
            'username': source.username,
            'domain': source.domain,
            'port': source.port,
            'path': source.path,
            'pattern': source.pattern,
            'recursive': source.recursive,
        })
    config['source'] = source_cfg

    # --- Destination ------------------------------------------------------
    destination = self.destination
    dest_type = type(destination).__name__.replace('Destination', '').lower()
    # MilvusDestination serves both local Milvus and Zilliz; an http(s)
    # db_path is treated as Zilliz.
    if dest_type == 'milvus' and destination.db_path.startswith('http'):
        dest_type = 'zilliz'

    dest_cfg: Dict[str, Any] = {'type': dest_type}
    if isinstance(destination, MilvusDestination):
        # api_key / token are not kept on the object and cannot be recovered.
        dest_cfg.update({
            'db_path': destination.db_path,
            'collection_name': destination.collection_name,
            'dimension': destination.dimension,
        })
    elif isinstance(destination, LocalDestination):
        dest_cfg.update({
            'output_dir': str(destination.output_dir),
        })
    elif isinstance(destination, S3Destination):
        dest_cfg.update({
            'endpoint': destination.endpoint,
            'bucket': destination.bucket,
            'prefix': destination.prefix,
        })
        # access_key, secret_key and region are not kept on the object.
        dest_cfg['region'] = 'us-east-1'  # default value
    config['destination'] = dest_cfg

    # --- API --------------------------------------------------------------
    config['api_base_url'] = self.api_base_url
    # Shallow copy so callers cannot mutate the pipeline's own headers.
    config['api_headers'] = dict(self.api_headers.items())

    # --- Stages -----------------------------------------------------------
    stages = []
    for stage in self.stages:
        stage_config = stage.config
        if isinstance(stage_config, (ParseConfig, ChunkConfig, EmbedConfig)):
            serialized = stage_config.to_dict()
        elif isinstance(stage_config, dict):
            # Copy so edits to the exported config do not leak back into
            # the live stage configuration.
            serialized = dict(stage_config)
        else:
            # Unknown config type: fall back to its string form.
            serialized = str(stage_config)
        stages.append({'type': stage.type, 'config': serialized})
    config['stages'] = stages

    # --- Pipeline config (emitted only when intermediate results are on) --
    pipeline_config = self.pipeline_config
    if pipeline_config.include_intermediate_results:
        inter_cfg: Dict[str, Any] = {}
        inter_dest = pipeline_config.intermediate_results_destination
        if inter_dest:
            inter_cfg['type'] = (
                type(inter_dest).__name__.replace('Destination', '').lower()
            )
            if isinstance(inter_dest, LocalDestination):
                inter_cfg['output_dir'] = str(inter_dest.output_dir)
            elif isinstance(inter_dest, S3Destination):
                inter_cfg.update({
                    'endpoint': inter_dest.endpoint,
                    'bucket': inter_dest.bucket,
                    'prefix': inter_dest.prefix,
                })
                # access_key, secret_key and region are not kept on the
                # object; report the default region.
                inter_cfg['region'] = 'us-east-1'
        config['pipeline_config'] = {
            'include_intermediate_results': True,
            'intermediate_results_destination': inter_cfg,
        }

    return config
214
+
82
215
  def _call_pipeline_api(self, file_bytes: bytes, filename: str, data_source: Dict[str, Any]) -> Optional[Dict[str, Any]]:
83
216
  url = f"{self.api_base_url}/pipeline"
84
217
  max_retries = 3
@@ -228,7 +361,6 @@ class Pipeline:
228
361
  print(f" → 读取文件...")
229
362
  file_bytes, data_source = self.source.read_file(file_path)
230
363
  data_source = data_source or {}
231
- print(f" data_source: {data_source}")
232
364
  # 转换为毫秒时间戳字符串
233
365
  timestamp_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
234
366
  data_source['date_processed'] = str(timestamp_ms)
@@ -103,7 +103,6 @@ class S3Source(Source):
103
103
  params['Delimiter'] = '/'
104
104
 
105
105
  for page in paginator.paginate(**params):
106
- print(page)
107
106
  if 'Contents' in page:
108
107
  for obj in page['Contents']:
109
108
  key = obj['Key']
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.6
3
+ Version: 0.2.8
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -517,298 +517,357 @@ Parse 参数中有必填项`Provider`,表示文档解析服务的供应商,
517
517
 
518
518
  ## 💡 使用示例
519
519
 
520
- ### 示例 1: 使用 config 字典配置(推荐)
520
+ ### 示例 1: 手动创建 Pipeline(推荐)
521
521
 
522
522
  ```python
523
- from xparse_client import create_pipeline_from_config
523
+ from xparse_client import (
524
+ Pipeline, S3Source, MilvusDestination,
525
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
526
+ )
524
527
 
525
- # 完整的配置示例
526
- config = {
527
- # S3 数据源配置
528
- 'source': {
529
- 'type': 's3',
530
- 'endpoint': 'https://your-minio.com',
531
- 'access_key': 'your-access-key',
532
- 'secret_key': 'your-secret-key',
533
- 'bucket': 'documents',
534
- 'prefix': 'pdfs/',
535
- 'region': 'us-east-1',
536
- 'pattern': '*.pdf' # 仅处理匹配的文件
537
- },
538
-
539
- # Milvus 目的地配置
540
- 'destination': {
541
- 'type': 'milvus',
542
- 'db_path': './vectors.db',
543
- 'collection_name': 'documents',
544
- 'dimension': 1024
545
- },
546
-
547
- # API 配置
548
- 'api_base_url': 'https://api.textin.com/api/xparse',
549
- 'api_headers': {
528
+ # 创建数据源
529
+ source = S3Source(
530
+ endpoint='https://your-minio.com',
531
+ access_key='your-access-key',
532
+ secret_key='your-secret-key',
533
+ bucket='documents',
534
+ prefix='pdfs/',
535
+ region='us-east-1',
536
+ pattern='*.pdf', # 仅处理匹配的文件
537
+ recursive=False # 不递归子目录
538
+ )
539
+
540
+ # 创建目的地
541
+ destination = MilvusDestination(
542
+ db_path='./vectors.db',
543
+ collection_name='documents',
544
+ dimension=1024
545
+ )
546
+
547
+ # 配置处理阶段
548
+ stages = [
549
+ Stage(
550
+ type='parse',
551
+ config=ParseConfig(provider='textin')
552
+ ),
553
+ Stage(
554
+ type='chunk',
555
+ config=ChunkConfig(
556
+ strategy='by_title', # 按标题分块
557
+ include_orig_elements=False,
558
+ new_after_n_chars=512,
559
+ max_characters=1024,
560
+ overlap=50 # 块之间重叠 50 字符
561
+ )
562
+ ),
563
+ Stage(
564
+ type='embed',
565
+ config=EmbedConfig(
566
+ provider='qwen',
567
+ model_name='text-embedding-v3'
568
+ )
569
+ )
570
+ ]
571
+
572
+ # 创建并运行 Pipeline
573
+ pipeline = Pipeline(
574
+ source=source,
575
+ destination=destination,
576
+ api_base_url='https://api.textin.com/api/xparse',
577
+ api_headers={
550
578
  'x-ti-app-id': 'your-app-id',
551
579
  'x-ti-secret-code': 'your-secret-code'
552
580
  },
553
-
554
- # Stages 配置
555
- 'stages': [
556
- {
557
- 'type': 'parse',
558
- 'config': {
559
- 'provider': 'textin'
560
- }
561
- },
562
- {
563
- 'type': 'chunk',
564
- 'config': {
565
- 'strategy': 'by_title', # 按标题分块
566
- 'include_orig_elements': False,
567
- 'new_after_n_chars': 512,
568
- 'max_characters': 1024,
569
- 'overlap': 50 # 块之间重叠 50 字符
570
- }
571
- },
572
- {
573
- 'type': 'embed',
574
- 'config': {
575
- 'provider': 'qwen',
576
- 'model_name': 'text-embedding-v3'
577
- }
578
- }
579
- ]
580
- }
581
+ stages=stages
582
+ )
581
583
 
582
- # 使用配置创建并运行 pipeline
583
- pipeline = create_pipeline_from_config(config)
584
584
  pipeline.run()
585
585
  ```
586
586
 
587
- ### 示例 2: 本地到本地(测试)
587
+ ### 示例 1.1: 输出配置字典
588
+
589
+ 手动创建 Pipeline 后,可以使用 `get_config()` 方法获取配置字典:
588
590
 
589
591
  ```python
590
- from datetime import datetime, timezone
591
- from xparse_client import create_pipeline_from_config
592
+ from xparse_client import (
593
+ Pipeline, LocalSource, LocalDestination,
594
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
595
+ )
592
596
 
593
- config = {
594
- 'source': {
595
- 'type': 'local',
596
- 'directory': './test_files',
597
- 'pattern': '*.pdf'
598
- },
599
- 'destination': {
600
- 'type': 'local',
601
- 'output_dir': './test_output'
597
+ # 手动创建 Pipeline
598
+ source = LocalSource(
599
+ directory='./test_files',
600
+ pattern='*.pdf',
601
+ recursive=False
602
+ )
603
+
604
+ destination = LocalDestination(output_dir='./test_output')
605
+
606
+ stages = [
607
+ Stage(type='parse', config=ParseConfig(provider='textin')),
608
+ Stage(type='chunk', config=ChunkConfig(strategy='basic', max_characters=1024)),
609
+ Stage(type='embed', config=EmbedConfig(provider='qwen', model_name='text-embedding-v3'))
610
+ ]
611
+
612
+ pipeline = Pipeline(
613
+ source=source,
614
+ destination=destination,
615
+ api_base_url='https://api.textin.com/api/xparse',
616
+ api_headers={
617
+ 'x-ti-app-id': 'your-app-id',
618
+ 'x-ti-secret-code': 'your-secret-code'
602
619
  },
603
- 'api_base_url': 'https://api.textin.com/api/xparse',
604
- 'api_headers': {
620
+ stages=stages
621
+ )
622
+
623
+ # 获取配置字典(格式与 create_pipeline_from_config 的入参一致)
624
+ config_dict = pipeline.get_config()
625
+
626
+ # 可以保存为 JSON 文件
627
+ import json
628
+ with open('pipeline_config.json', 'w', encoding='utf-8') as f:
629
+ json.dump(config_dict, f, indent=2, ensure_ascii=False)
630
+
631
+ # 或者用于创建新的 Pipeline(需要补充敏感信息如 access_key, secret_key 等)
632
+ # from xparse_client import create_pipeline_from_config
633
+ # new_pipeline = create_pipeline_from_config(config_dict)
634
+ ```
635
+
636
+ ### 示例 2: 本地到本地(测试)
637
+
638
+ ```python
639
+ from xparse_client import (
640
+ Pipeline, LocalSource, LocalDestination,
641
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
642
+ )
643
+
644
+ # 创建本地数据源
645
+ source = LocalSource(
646
+ directory='./test_files',
647
+ pattern='*.pdf',
648
+ recursive=False
649
+ )
650
+
651
+ # 创建本地输出目的地
652
+ destination = LocalDestination(output_dir='./test_output')
653
+
654
+ # 配置处理阶段
655
+ stages = [
656
+ Stage(
657
+ type='parse',
658
+ config=ParseConfig(provider='textin')
659
+ ),
660
+ Stage(
661
+ type='chunk',
662
+ config=ChunkConfig(
663
+ strategy='basic',
664
+ max_characters=1024
665
+ )
666
+ ),
667
+ Stage(
668
+ type='embed',
669
+ config=EmbedConfig(
670
+ provider='qwen',
671
+ model_name='text-embedding-v3'
672
+ )
673
+ )
674
+ ]
675
+
676
+ # 创建并运行 Pipeline
677
+ pipeline = Pipeline(
678
+ source=source,
679
+ destination=destination,
680
+ api_base_url='https://api.textin.com/api/xparse',
681
+ api_headers={
605
682
  'x-ti-app-id': 'your-app-id',
606
683
  'x-ti-secret-code': 'your-secret-code'
607
684
  },
608
- # Stages 配置
609
- 'stages': [
610
- {
611
- 'type': 'parse',
612
- 'config': {
613
- 'provider': 'textin'
614
- }
615
- },
616
- {
617
- 'type': 'chunk',
618
- 'config': {
619
- 'strategy': 'basic',
620
- 'max_characters': 1024
621
- }
622
- },
623
- {
624
- 'type': 'embed',
625
- 'config': {
626
- 'provider': 'qwen',
627
- 'model_name': 'text-embedding-v3'
628
- }
629
- }
630
- ]
631
- }
685
+ stages=stages
686
+ )
632
687
 
633
- pipeline = create_pipeline_from_config(config)
634
688
  pipeline.run()
635
689
  ```
636
690
 
637
691
  ### 示例 3: 不同分块策略的配置
638
692
 
639
693
  ```python
640
- from xparse_client import create_pipeline_from_config
694
+ from xparse_client import (
695
+ Pipeline, S3Source, MilvusDestination,
696
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
697
+ )
698
+
699
+ # 创建数据源和目的地
700
+ source = S3Source(...)
701
+ destination = MilvusDestination(...)
641
702
 
642
703
  # 配置 1:按页面分块(适合 PDF 文档)
643
- config_by_page = {
644
- 'source': {...},
645
- 'destination': {...},
646
- 'api_base_url': 'https://api.textin.com/api/xparse',
647
- 'api_headers': {...},
648
- 'stages': [
649
- {
650
- 'type': 'parse',
651
- 'config': {
652
- 'provider': 'textin'
653
- }
654
- },
655
- {
656
- 'type': 'chunk',
657
- 'config': {
658
- 'strategy': 'by_page', # 按页面分块
659
- 'max_characters': 2048, # 增大块大小
660
- 'overlap': 100 # 页面间重叠 100 字符
661
- }
662
- },
663
- {
664
- 'type': 'embed',
665
- 'config': {
666
- 'provider': 'qwen',
667
- 'model_name': 'text-embedding-v4' # 使用更高精度的模型
668
- }
669
- }
670
- ]
671
- }
704
+ stages_by_page = [
705
+ Stage(
706
+ type='parse',
707
+ config=ParseConfig(provider='textin')
708
+ ),
709
+ Stage(
710
+ type='chunk',
711
+ config=ChunkConfig(
712
+ strategy='by_page', # 按页面分块
713
+ max_characters=2048, # 增大块大小
714
+ overlap=100 # 页面间重叠 100 字符
715
+ )
716
+ ),
717
+ Stage(
718
+ type='embed',
719
+ config=EmbedConfig(
720
+ provider='qwen',
721
+ model_name='text-embedding-v4' # 使用更高精度的模型
722
+ )
723
+ )
724
+ ]
672
725
 
673
726
  # 配置 2:按标题分块(适合结构化文档)
674
- config_by_title = {
675
- 'source': {...},
676
- 'destination': {...},
677
- 'api_base_url': 'https://api.textin.com/api/xparse',
678
- 'api_headers': {...},
679
- 'stages': [
680
- {
681
- 'type': 'parse',
682
- 'config': {
683
- 'provider': 'textin'
684
- }
685
- },
686
- {
687
- 'type': 'chunk',
688
- 'config': {
689
- 'strategy': 'by_title', # 按标题分块
690
- 'include_orig_elements': True, # 保留原始元素信息
691
- 'max_characters': 1536
692
- }
693
- },
694
- {
695
- 'type': 'embed',
696
- 'config': {
697
- 'provider': 'qwen',
698
- 'model_name': 'text-embedding-v3'
699
- }
700
- }
701
- ]
702
- }
727
+ stages_by_title = [
728
+ Stage(
729
+ type='parse',
730
+ config=ParseConfig(provider='textin')
731
+ ),
732
+ Stage(
733
+ type='chunk',
734
+ config=ChunkConfig(
735
+ strategy='by_title', # 按标题分块
736
+ include_orig_elements=True, # 保留原始元素信息
737
+ max_characters=1536
738
+ )
739
+ ),
740
+ Stage(
741
+ type='embed',
742
+ config=EmbedConfig(
743
+ provider='qwen',
744
+ model_name='text-embedding-v3'
745
+ )
746
+ )
747
+ ]
703
748
 
704
749
  # 根据文档类型选择配置
705
- pipeline = create_pipeline_from_config(config_by_page)
750
+ pipeline = Pipeline(
751
+ source=source,
752
+ destination=destination,
753
+ api_base_url='https://api.textin.com/api/xparse',
754
+ api_headers={...},
755
+ stages=stages_by_page # 或 stages_by_title
756
+ )
706
757
  pipeline.run()
707
758
  ```
708
759
 
709
760
  ### 示例 4: FTP 数据源配置
710
761
 
711
762
  ```python
712
- from xparse_client import create_pipeline_from_config
763
+ from xparse_client import (
764
+ Pipeline, FtpSource, MilvusDestination,
765
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
766
+ )
713
767
 
714
- config = {
715
- # FTP 数据源
716
- 'source': {
717
- 'type': 'ftp',
718
- 'host': 'ftp.example.com',
719
- 'port': 21,
720
- 'username': 'user',
721
- 'password': 'pass'
722
- },
723
-
724
- # Milvus 目的地
725
- 'destination': {
726
- 'type': 'milvus',
727
- 'db_path': './vectors.db',
728
- 'collection_name': 'ftp_docs',
729
- 'dimension': 1024
730
- },
731
-
732
- 'api_base_url': 'https://api.textin.com/api/xparse',
733
- 'api_headers': {
768
+ # 创建 FTP 数据源
769
+ source = FtpSource(
770
+ host='ftp.example.com',
771
+ port=21,
772
+ username='user',
773
+ password='pass',
774
+ pattern='*.pdf',
775
+ recursive=False
776
+ )
777
+
778
+ # 创建 Milvus 目的地
779
+ destination = MilvusDestination(
780
+ db_path='./vectors.db',
781
+ collection_name='ftp_docs',
782
+ dimension=1024
783
+ )
784
+
785
+ # 配置处理阶段
786
+ stages = [
787
+ Stage(
788
+ type='parse',
789
+ config=ParseConfig(provider='textin')
790
+ ),
791
+ Stage(
792
+ type='chunk',
793
+ config=ChunkConfig(
794
+ strategy='basic',
795
+ max_characters=1024
796
+ )
797
+ ),
798
+ Stage(
799
+ type='embed',
800
+ config=EmbedConfig(
801
+ provider='qwen',
802
+ model_name='text-embedding-v3'
803
+ )
804
+ )
805
+ ]
806
+
807
+ # 创建并运行 Pipeline
808
+ pipeline = Pipeline(
809
+ source=source,
810
+ destination=destination,
811
+ api_base_url='https://api.textin.com/api/xparse',
812
+ api_headers={
734
813
  'x-ti-app-id': 'app-id',
735
814
  'x-ti-secret-code': 'secret'
736
815
  },
737
-
738
- # Stages 配置
739
- 'stages': [
740
- {
741
- 'type': 'parse',
742
- 'config': {
743
- 'provider': 'textin'
744
- }
745
- },
746
- {
747
- 'type': 'chunk',
748
- 'config': {
749
- 'strategy': 'basic',
750
- 'max_characters': 1024
751
- }
752
- },
753
- {
754
- 'type': 'embed',
755
- 'config': {
756
- 'provider': 'qwen',
757
- 'model_name': 'text-embedding-v3'
758
- }
759
- }
760
- ]
761
- }
816
+ stages=stages
817
+ )
762
818
 
763
- pipeline = create_pipeline_from_config(config)
764
819
  pipeline.run()
765
820
  ```
766
821
 
767
822
  ### 示例 5: 获取处理统计信息
768
823
 
769
824
  ```python
770
- from xparse_client import create_pipeline_from_config
825
+ from datetime import datetime, timezone
826
+ from xparse_client import (
827
+ Pipeline, LocalSource, LocalDestination,
828
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
829
+ )
771
830
 
772
- config = {
773
- 'source': {
774
- 'type': 'local',
775
- 'directory': './docs',
776
- 'pattern': '*.pdf'
777
- },
778
- 'destination': {
779
- 'type': 'local',
780
- 'output_dir': './output'
781
- },
782
- 'api_base_url': 'https://api.textin.com/api/xparse',
783
- 'api_headers': {
831
+ # 创建 Pipeline
832
+ source = LocalSource(
833
+ directory='./docs',
834
+ pattern='*.pdf',
835
+ recursive=False
836
+ )
837
+
838
+ destination = LocalDestination(output_dir='./output')
839
+
840
+ stages = [
841
+ Stage(
842
+ type='parse',
843
+ config=ParseConfig(provider='textin')
844
+ ),
845
+ Stage(
846
+ type='chunk',
847
+ config=ChunkConfig(
848
+ strategy='basic',
849
+ max_characters=1024
850
+ )
851
+ ),
852
+ Stage(
853
+ type='embed',
854
+ config=EmbedConfig(
855
+ provider='qwen',
856
+ model_name='text-embedding-v3'
857
+ )
858
+ )
859
+ ]
860
+
861
+ pipeline = Pipeline(
862
+ source=source,
863
+ destination=destination,
864
+ api_base_url='https://api.textin.com/api/xparse',
865
+ api_headers={
784
866
  'x-ti-app-id': 'your-app-id',
785
867
  'x-ti-secret-code': 'your-secret-code'
786
868
  },
787
- 'stages': [
788
- {
789
- 'type': 'parse',
790
- 'config': {
791
- 'provider': 'textin'
792
- }
793
- },
794
- {
795
- 'type': 'chunk',
796
- 'config': {
797
- 'strategy': 'basic',
798
- 'max_characters': 1024
799
- }
800
- },
801
- {
802
- 'type': 'embed',
803
- 'config': {
804
- 'provider': 'qwen',
805
- 'model_name': 'text-embedding-v3'
806
- }
807
- }
808
- ]
809
- }
810
-
811
- pipeline = create_pipeline_from_config(config)
869
+ stages=stages
870
+ )
812
871
 
813
872
  # 处理单个文件并获取统计信息
814
873
  file_bytes, data_source = pipeline.source.read_file('document.pdf')
@@ -0,0 +1,13 @@
1
+ example/run_pipeline.py,sha256=ybAWBPXcQClRk1HOMySLi9IUPIs1Qn-S5HXNLbNJHjs,15459
2
+ example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
+ xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
+ xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
+ xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
6
+ xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
7
+ xparse_client/pipeline/pipeline.py,sha256=IRTxN4YUJi9Wrm1G1ysGvcwsPsGh0inbquBH3nWYmAA,26477
8
+ xparse_client/pipeline/sources.py,sha256=UeVbWv6n0wQkIZIBBhrFCiyydQX7cvwmkoMgcf12p9g,19940
9
+ xparse_client-0.2.8.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
+ xparse_client-0.2.8.dist-info/METADATA,sha256=LX8TfLSbFZerGPhh16x5QK1lwrPh55CYKNLhr2kdBcY,27850
11
+ xparse_client-0.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ xparse_client-0.2.8.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
+ xparse_client-0.2.8.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- example/run_pipeline.py,sha256=ijws5q_vMmV0-bMHuFtOUMrEnxnL1LvOBCtcCD2c8zc,15366
2
- example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
- xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
- xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
- xparse_client/pipeline/config.py,sha256=FFYq2a0dBWBEj70s2aInXOiQ5MwwHimd6SI2_tkp52w,4138
6
- xparse_client/pipeline/destinations.py,sha256=F0z1AgVIBOn0m32i4l7LCMkJE0IbBdlpykO_at_wLaE,11931
7
- xparse_client/pipeline/pipeline.py,sha256=BEC1kf5HKn4qIiNMe5QPNOs9PJcjaE_4qRK0I9yaLSQ,20430
8
- xparse_client/pipeline/sources.py,sha256=0m2GFYCQTORSkbn_fWIWrOOSFes4Aso6oyVJDHfDeqc,19964
9
- xparse_client-0.2.6.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
- xparse_client-0.2.6.dist-info/METADATA,sha256=-VNzfb68RsczdETZ17vRhGzClj2X5mrwiwT5-HzK1XE,26805
11
- xparse_client-0.2.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
- xparse_client-0.2.6.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
- xparse_client-0.2.6.dist-info/RECORD,,