xparse-client 0.2.6__tar.gz → 0.2.8__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.6
3
+ Version: 0.2.8
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -517,298 +517,357 @@ Parse 参数中有必填项`Provider`,表示文档解析服务的供应商,
517
517
 
518
518
  ## 💡 使用示例
519
519
 
520
- ### 示例 1: 使用 config 字典配置(推荐)
520
+ ### 示例 1: 手动创建 Pipeline(推荐)
521
521
 
522
522
  ```python
523
- from xparse_client import create_pipeline_from_config
523
+ from xparse_client import (
524
+ Pipeline, S3Source, MilvusDestination,
525
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
526
+ )
524
527
 
525
- # 完整的配置示例
526
- config = {
527
- # S3 数据源配置
528
- 'source': {
529
- 'type': 's3',
530
- 'endpoint': 'https://your-minio.com',
531
- 'access_key': 'your-access-key',
532
- 'secret_key': 'your-secret-key',
533
- 'bucket': 'documents',
534
- 'prefix': 'pdfs/',
535
- 'region': 'us-east-1',
536
- 'pattern': '*.pdf' # 仅处理匹配的文件
537
- },
538
-
539
- # Milvus 目的地配置
540
- 'destination': {
541
- 'type': 'milvus',
542
- 'db_path': './vectors.db',
543
- 'collection_name': 'documents',
544
- 'dimension': 1024
545
- },
546
-
547
- # API 配置
548
- 'api_base_url': 'https://api.textin.com/api/xparse',
549
- 'api_headers': {
528
+ # 创建数据源
529
+ source = S3Source(
530
+ endpoint='https://your-minio.com',
531
+ access_key='your-access-key',
532
+ secret_key='your-secret-key',
533
+ bucket='documents',
534
+ prefix='pdfs/',
535
+ region='us-east-1',
536
+ pattern='*.pdf', # 仅处理匹配的文件
537
+ recursive=False # 不递归子目录
538
+ )
539
+
540
+ # 创建目的地
541
+ destination = MilvusDestination(
542
+ db_path='./vectors.db',
543
+ collection_name='documents',
544
+ dimension=1024
545
+ )
546
+
547
+ # 配置处理阶段
548
+ stages = [
549
+ Stage(
550
+ type='parse',
551
+ config=ParseConfig(provider='textin')
552
+ ),
553
+ Stage(
554
+ type='chunk',
555
+ config=ChunkConfig(
556
+ strategy='by_title', # 按标题分块
557
+ include_orig_elements=False,
558
+ new_after_n_chars=512,
559
+ max_characters=1024,
560
+ overlap=50 # 块之间重叠 50 字符
561
+ )
562
+ ),
563
+ Stage(
564
+ type='embed',
565
+ config=EmbedConfig(
566
+ provider='qwen',
567
+ model_name='text-embedding-v3'
568
+ )
569
+ )
570
+ ]
571
+
572
+ # 创建并运行 Pipeline
573
+ pipeline = Pipeline(
574
+ source=source,
575
+ destination=destination,
576
+ api_base_url='https://api.textin.com/api/xparse',
577
+ api_headers={
550
578
  'x-ti-app-id': 'your-app-id',
551
579
  'x-ti-secret-code': 'your-secret-code'
552
580
  },
553
-
554
- # Stages 配置
555
- 'stages': [
556
- {
557
- 'type': 'parse',
558
- 'config': {
559
- 'provider': 'textin'
560
- }
561
- },
562
- {
563
- 'type': 'chunk',
564
- 'config': {
565
- 'strategy': 'by_title', # 按标题分块
566
- 'include_orig_elements': False,
567
- 'new_after_n_chars': 512,
568
- 'max_characters': 1024,
569
- 'overlap': 50 # 块之间重叠 50 字符
570
- }
571
- },
572
- {
573
- 'type': 'embed',
574
- 'config': {
575
- 'provider': 'qwen',
576
- 'model_name': 'text-embedding-v3'
577
- }
578
- }
579
- ]
580
- }
581
+ stages=stages
582
+ )
581
583
 
582
- # 使用配置创建并运行 pipeline
583
- pipeline = create_pipeline_from_config(config)
584
584
  pipeline.run()
585
585
  ```
586
586
 
587
- ### 示例 2: 本地到本地(测试)
587
+ ### 示例 1.1: 输出配置字典
588
+
589
+ 手动创建 Pipeline 后,可以使用 `get_config()` 方法获取配置字典:
588
590
 
589
591
  ```python
590
- from datetime import datetime, timezone
591
- from xparse_client import create_pipeline_from_config
592
+ from xparse_client import (
593
+ Pipeline, LocalSource, LocalDestination,
594
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
595
+ )
592
596
 
593
- config = {
594
- 'source': {
595
- 'type': 'local',
596
- 'directory': './test_files',
597
- 'pattern': '*.pdf'
598
- },
599
- 'destination': {
600
- 'type': 'local',
601
- 'output_dir': './test_output'
597
+ # 手动创建 Pipeline
598
+ source = LocalSource(
599
+ directory='./test_files',
600
+ pattern='*.pdf',
601
+ recursive=False
602
+ )
603
+
604
+ destination = LocalDestination(output_dir='./test_output')
605
+
606
+ stages = [
607
+ Stage(type='parse', config=ParseConfig(provider='textin')),
608
+ Stage(type='chunk', config=ChunkConfig(strategy='basic', max_characters=1024)),
609
+ Stage(type='embed', config=EmbedConfig(provider='qwen', model_name='text-embedding-v3'))
610
+ ]
611
+
612
+ pipeline = Pipeline(
613
+ source=source,
614
+ destination=destination,
615
+ api_base_url='https://api.textin.com/api/xparse',
616
+ api_headers={
617
+ 'x-ti-app-id': 'your-app-id',
618
+ 'x-ti-secret-code': 'your-secret-code'
602
619
  },
603
- 'api_base_url': 'https://api.textin.com/api/xparse',
604
- 'api_headers': {
620
+ stages=stages
621
+ )
622
+
623
+ # 获取配置字典(格式与 create_pipeline_from_config 的入参一致)
624
+ config_dict = pipeline.get_config()
625
+
626
+ # 可以保存为 JSON 文件
627
+ import json
628
+ with open('pipeline_config.json', 'w', encoding='utf-8') as f:
629
+ json.dump(config_dict, f, indent=2, ensure_ascii=False)
630
+
631
+ # 或者用于创建新的 Pipeline(需要补充敏感信息如 access_key, secret_key 等)
632
+ # from xparse_client import create_pipeline_from_config
633
+ # new_pipeline = create_pipeline_from_config(config_dict)
634
+ ```
635
+
636
+ ### 示例 2: 本地到本地(测试)
637
+
638
+ ```python
639
+ from xparse_client import (
640
+ Pipeline, LocalSource, LocalDestination,
641
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
642
+ )
643
+
644
+ # 创建本地数据源
645
+ source = LocalSource(
646
+ directory='./test_files',
647
+ pattern='*.pdf',
648
+ recursive=False
649
+ )
650
+
651
+ # 创建本地输出目的地
652
+ destination = LocalDestination(output_dir='./test_output')
653
+
654
+ # 配置处理阶段
655
+ stages = [
656
+ Stage(
657
+ type='parse',
658
+ config=ParseConfig(provider='textin')
659
+ ),
660
+ Stage(
661
+ type='chunk',
662
+ config=ChunkConfig(
663
+ strategy='basic',
664
+ max_characters=1024
665
+ )
666
+ ),
667
+ Stage(
668
+ type='embed',
669
+ config=EmbedConfig(
670
+ provider='qwen',
671
+ model_name='text-embedding-v3'
672
+ )
673
+ )
674
+ ]
675
+
676
+ # 创建并运行 Pipeline
677
+ pipeline = Pipeline(
678
+ source=source,
679
+ destination=destination,
680
+ api_base_url='https://api.textin.com/api/xparse',
681
+ api_headers={
605
682
  'x-ti-app-id': 'your-app-id',
606
683
  'x-ti-secret-code': 'your-secret-code'
607
684
  },
608
- # Stages 配置
609
- 'stages': [
610
- {
611
- 'type': 'parse',
612
- 'config': {
613
- 'provider': 'textin'
614
- }
615
- },
616
- {
617
- 'type': 'chunk',
618
- 'config': {
619
- 'strategy': 'basic',
620
- 'max_characters': 1024
621
- }
622
- },
623
- {
624
- 'type': 'embed',
625
- 'config': {
626
- 'provider': 'qwen',
627
- 'model_name': 'text-embedding-v3'
628
- }
629
- }
630
- ]
631
- }
685
+ stages=stages
686
+ )
632
687
 
633
- pipeline = create_pipeline_from_config(config)
634
688
  pipeline.run()
635
689
  ```
636
690
 
637
691
  ### 示例 3: 不同分块策略的配置
638
692
 
639
693
  ```python
640
- from xparse_client import create_pipeline_from_config
694
+ from xparse_client import (
695
+ Pipeline, S3Source, MilvusDestination,
696
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
697
+ )
698
+
699
+ # 创建数据源和目的地
700
+ source = S3Source(...)
701
+ destination = MilvusDestination(...)
641
702
 
642
703
  # 配置 1:按页面分块(适合 PDF 文档)
643
- config_by_page = {
644
- 'source': {...},
645
- 'destination': {...},
646
- 'api_base_url': 'https://api.textin.com/api/xparse',
647
- 'api_headers': {...},
648
- 'stages': [
649
- {
650
- 'type': 'parse',
651
- 'config': {
652
- 'provider': 'textin'
653
- }
654
- },
655
- {
656
- 'type': 'chunk',
657
- 'config': {
658
- 'strategy': 'by_page', # 按页面分块
659
- 'max_characters': 2048, # 增大块大小
660
- 'overlap': 100 # 页面间重叠 100 字符
661
- }
662
- },
663
- {
664
- 'type': 'embed',
665
- 'config': {
666
- 'provider': 'qwen',
667
- 'model_name': 'text-embedding-v4' # 使用更高精度的模型
668
- }
669
- }
670
- ]
671
- }
704
+ stages_by_page = [
705
+ Stage(
706
+ type='parse',
707
+ config=ParseConfig(provider='textin')
708
+ ),
709
+ Stage(
710
+ type='chunk',
711
+ config=ChunkConfig(
712
+ strategy='by_page', # 按页面分块
713
+ max_characters=2048, # 增大块大小
714
+ overlap=100 # 页面间重叠 100 字符
715
+ )
716
+ ),
717
+ Stage(
718
+ type='embed',
719
+ config=EmbedConfig(
720
+ provider='qwen',
721
+ model_name='text-embedding-v4' # 使用更高精度的模型
722
+ )
723
+ )
724
+ ]
672
725
 
673
726
  # 配置 2:按标题分块(适合结构化文档)
674
- config_by_title = {
675
- 'source': {...},
676
- 'destination': {...},
677
- 'api_base_url': 'https://api.textin.com/api/xparse',
678
- 'api_headers': {...},
679
- 'stages': [
680
- {
681
- 'type': 'parse',
682
- 'config': {
683
- 'provider': 'textin'
684
- }
685
- },
686
- {
687
- 'type': 'chunk',
688
- 'config': {
689
- 'strategy': 'by_title', # 按标题分块
690
- 'include_orig_elements': True, # 保留原始元素信息
691
- 'max_characters': 1536
692
- }
693
- },
694
- {
695
- 'type': 'embed',
696
- 'config': {
697
- 'provider': 'qwen',
698
- 'model_name': 'text-embedding-v3'
699
- }
700
- }
701
- ]
702
- }
727
+ stages_by_title = [
728
+ Stage(
729
+ type='parse',
730
+ config=ParseConfig(provider='textin')
731
+ ),
732
+ Stage(
733
+ type='chunk',
734
+ config=ChunkConfig(
735
+ strategy='by_title', # 按标题分块
736
+ include_orig_elements=True, # 保留原始元素信息
737
+ max_characters=1536
738
+ )
739
+ ),
740
+ Stage(
741
+ type='embed',
742
+ config=EmbedConfig(
743
+ provider='qwen',
744
+ model_name='text-embedding-v3'
745
+ )
746
+ )
747
+ ]
703
748
 
704
749
  # 根据文档类型选择配置
705
- pipeline = create_pipeline_from_config(config_by_page)
750
+ pipeline = Pipeline(
751
+ source=source,
752
+ destination=destination,
753
+ api_base_url='https://api.textin.com/api/xparse',
754
+ api_headers={...},
755
+ stages=stages_by_page # 或 stages_by_title
756
+ )
706
757
  pipeline.run()
707
758
  ```
708
759
 
709
760
  ### 示例 4: FTP 数据源配置
710
761
 
711
762
  ```python
712
- from xparse_client import create_pipeline_from_config
763
+ from xparse_client import (
764
+ Pipeline, FtpSource, MilvusDestination,
765
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
766
+ )
713
767
 
714
- config = {
715
- # FTP 数据源
716
- 'source': {
717
- 'type': 'ftp',
718
- 'host': 'ftp.example.com',
719
- 'port': 21,
720
- 'username': 'user',
721
- 'password': 'pass'
722
- },
723
-
724
- # Milvus 目的地
725
- 'destination': {
726
- 'type': 'milvus',
727
- 'db_path': './vectors.db',
728
- 'collection_name': 'ftp_docs',
729
- 'dimension': 1024
730
- },
731
-
732
- 'api_base_url': 'https://api.textin.com/api/xparse',
733
- 'api_headers': {
768
+ # 创建 FTP 数据源
769
+ source = FtpSource(
770
+ host='ftp.example.com',
771
+ port=21,
772
+ username='user',
773
+ password='pass',
774
+ pattern='*.pdf',
775
+ recursive=False
776
+ )
777
+
778
+ # 创建 Milvus 目的地
779
+ destination = MilvusDestination(
780
+ db_path='./vectors.db',
781
+ collection_name='ftp_docs',
782
+ dimension=1024
783
+ )
784
+
785
+ # 配置处理阶段
786
+ stages = [
787
+ Stage(
788
+ type='parse',
789
+ config=ParseConfig(provider='textin')
790
+ ),
791
+ Stage(
792
+ type='chunk',
793
+ config=ChunkConfig(
794
+ strategy='basic',
795
+ max_characters=1024
796
+ )
797
+ ),
798
+ Stage(
799
+ type='embed',
800
+ config=EmbedConfig(
801
+ provider='qwen',
802
+ model_name='text-embedding-v3'
803
+ )
804
+ )
805
+ ]
806
+
807
+ # 创建并运行 Pipeline
808
+ pipeline = Pipeline(
809
+ source=source,
810
+ destination=destination,
811
+ api_base_url='https://api.textin.com/api/xparse',
812
+ api_headers={
734
813
  'x-ti-app-id': 'app-id',
735
814
  'x-ti-secret-code': 'secret'
736
815
  },
737
-
738
- # Stages 配置
739
- 'stages': [
740
- {
741
- 'type': 'parse',
742
- 'config': {
743
- 'provider': 'textin'
744
- }
745
- },
746
- {
747
- 'type': 'chunk',
748
- 'config': {
749
- 'strategy': 'basic',
750
- 'max_characters': 1024
751
- }
752
- },
753
- {
754
- 'type': 'embed',
755
- 'config': {
756
- 'provider': 'qwen',
757
- 'model_name': 'text-embedding-v3'
758
- }
759
- }
760
- ]
761
- }
816
+ stages=stages
817
+ )
762
818
 
763
- pipeline = create_pipeline_from_config(config)
764
819
  pipeline.run()
765
820
  ```
766
821
 
767
822
  ### 示例 5: 获取处理统计信息
768
823
 
769
824
  ```python
770
- from xparse_client import create_pipeline_from_config
825
+ from datetime import datetime, timezone
826
+ from xparse_client import (
827
+ Pipeline, LocalSource, LocalDestination,
828
+ ParseConfig, ChunkConfig, EmbedConfig, Stage
829
+ )
771
830
 
772
- config = {
773
- 'source': {
774
- 'type': 'local',
775
- 'directory': './docs',
776
- 'pattern': '*.pdf'
777
- },
778
- 'destination': {
779
- 'type': 'local',
780
- 'output_dir': './output'
781
- },
782
- 'api_base_url': 'https://api.textin.com/api/xparse',
783
- 'api_headers': {
831
+ # 创建 Pipeline
832
+ source = LocalSource(
833
+ directory='./docs',
834
+ pattern='*.pdf',
835
+ recursive=False
836
+ )
837
+
838
+ destination = LocalDestination(output_dir='./output')
839
+
840
+ stages = [
841
+ Stage(
842
+ type='parse',
843
+ config=ParseConfig(provider='textin')
844
+ ),
845
+ Stage(
846
+ type='chunk',
847
+ config=ChunkConfig(
848
+ strategy='basic',
849
+ max_characters=1024
850
+ )
851
+ ),
852
+ Stage(
853
+ type='embed',
854
+ config=EmbedConfig(
855
+ provider='qwen',
856
+ model_name='text-embedding-v3'
857
+ )
858
+ )
859
+ ]
860
+
861
+ pipeline = Pipeline(
862
+ source=source,
863
+ destination=destination,
864
+ api_base_url='https://api.textin.com/api/xparse',
865
+ api_headers={
784
866
  'x-ti-app-id': 'your-app-id',
785
867
  'x-ti-secret-code': 'your-secret-code'
786
868
  },
787
- 'stages': [
788
- {
789
- 'type': 'parse',
790
- 'config': {
791
- 'provider': 'textin'
792
- }
793
- },
794
- {
795
- 'type': 'chunk',
796
- 'config': {
797
- 'strategy': 'basic',
798
- 'max_characters': 1024
799
- }
800
- },
801
- {
802
- 'type': 'embed',
803
- 'config': {
804
- 'provider': 'qwen',
805
- 'model_name': 'text-embedding-v3'
806
- }
807
- }
808
- ]
809
- }
810
-
811
- pipeline = create_pipeline_from_config(config)
869
+ stages=stages
870
+ )
812
871
 
813
872
  # 处理单个文件并获取统计信息
814
873
  file_bytes, data_source = pipeline.source.read_file('document.pdf')