xparse-client 0.2.7__tar.gz → 0.2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xparse_client-0.2.7 → xparse_client-0.2.8}/PKG-INFO +301 -242
- {xparse_client-0.2.7 → xparse_client-0.2.8}/README.md +300 -241
- {xparse_client-0.2.7 → xparse_client-0.2.8}/example/run_pipeline.py +4 -2
- {xparse_client-0.2.7 → xparse_client-0.2.8}/pyproject.toml +1 -1
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client/pipeline/pipeline.py +133 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client.egg-info/PKG-INFO +301 -242
- {xparse_client-0.2.7 → xparse_client-0.2.8}/LICENSE +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/example/run_pipeline_test.py +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/setup.cfg +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client/__init__.py +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client/pipeline/__init__.py +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client/pipeline/config.py +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client/pipeline/destinations.py +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client/pipeline/sources.py +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client.egg-info/SOURCES.txt +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client.egg-info/dependency_links.txt +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client.egg-info/requires.txt +0 -0
- {xparse_client-0.2.7 → xparse_client-0.2.8}/xparse_client.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xparse-client
|
|
3
|
-
Version: 0.2.7
|
|
3
|
+
Version: 0.2.8
|
|
4
4
|
Summary: 面向Agent和RAG的新一代文档处理 AI Infra
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
|
|
@@ -517,298 +517,357 @@ Parse 参数中有必填项`Provider`,表示文档解析服务的供应商,
|
|
|
517
517
|
|
|
518
518
|
## 💡 使用示例
|
|
519
519
|
|
|
520
|
-
### 示例 1:
|
|
520
|
+
### 示例 1: 手动创建 Pipeline(推荐)
|
|
521
521
|
|
|
522
522
|
```python
|
|
523
|
-
from xparse_client import create_pipeline_from_config
|
|
523
|
+
from xparse_client import (
|
|
524
|
+
Pipeline, S3Source, MilvusDestination,
|
|
525
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
526
|
+
)
|
|
524
527
|
|
|
525
|
-
#
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
'
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
'
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
528
|
+
# 创建数据源
|
|
529
|
+
source = S3Source(
|
|
530
|
+
endpoint='https://your-minio.com',
|
|
531
|
+
access_key='your-access-key',
|
|
532
|
+
secret_key='your-secret-key',
|
|
533
|
+
bucket='documents',
|
|
534
|
+
prefix='pdfs/',
|
|
535
|
+
region='us-east-1',
|
|
536
|
+
pattern='*.pdf', # 仅处理匹配的文件
|
|
537
|
+
recursive=False # 不递归子目录
|
|
538
|
+
)
|
|
539
|
+
|
|
540
|
+
# 创建目的地
|
|
541
|
+
destination = MilvusDestination(
|
|
542
|
+
db_path='./vectors.db',
|
|
543
|
+
collection_name='documents',
|
|
544
|
+
dimension=1024
|
|
545
|
+
)
|
|
546
|
+
|
|
547
|
+
# 配置处理阶段
|
|
548
|
+
stages = [
|
|
549
|
+
Stage(
|
|
550
|
+
type='parse',
|
|
551
|
+
config=ParseConfig(provider='textin')
|
|
552
|
+
),
|
|
553
|
+
Stage(
|
|
554
|
+
type='chunk',
|
|
555
|
+
config=ChunkConfig(
|
|
556
|
+
strategy='by_title', # 按标题分块
|
|
557
|
+
include_orig_elements=False,
|
|
558
|
+
new_after_n_chars=512,
|
|
559
|
+
max_characters=1024,
|
|
560
|
+
overlap=50 # 块之间重叠 50 字符
|
|
561
|
+
)
|
|
562
|
+
),
|
|
563
|
+
Stage(
|
|
564
|
+
type='embed',
|
|
565
|
+
config=EmbedConfig(
|
|
566
|
+
provider='qwen',
|
|
567
|
+
model_name='text-embedding-v3'
|
|
568
|
+
)
|
|
569
|
+
)
|
|
570
|
+
]
|
|
571
|
+
|
|
572
|
+
# 创建并运行 Pipeline
|
|
573
|
+
pipeline = Pipeline(
|
|
574
|
+
source=source,
|
|
575
|
+
destination=destination,
|
|
576
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
577
|
+
api_headers={
|
|
550
578
|
'x-ti-app-id': 'your-app-id',
|
|
551
579
|
'x-ti-secret-code': 'your-secret-code'
|
|
552
580
|
},
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
'stages': [
|
|
556
|
-
{
|
|
557
|
-
'type': 'parse',
|
|
558
|
-
'config': {
|
|
559
|
-
'provider': 'textin'
|
|
560
|
-
}
|
|
561
|
-
},
|
|
562
|
-
{
|
|
563
|
-
'type': 'chunk',
|
|
564
|
-
'config': {
|
|
565
|
-
'strategy': 'by_title', # 按标题分块
|
|
566
|
-
'include_orig_elements': False,
|
|
567
|
-
'new_after_n_chars': 512,
|
|
568
|
-
'max_characters': 1024,
|
|
569
|
-
'overlap': 50 # 块之间重叠 50 字符
|
|
570
|
-
}
|
|
571
|
-
},
|
|
572
|
-
{
|
|
573
|
-
'type': 'embed',
|
|
574
|
-
'config': {
|
|
575
|
-
'provider': 'qwen',
|
|
576
|
-
'model_name': 'text-embedding-v3'
|
|
577
|
-
}
|
|
578
|
-
}
|
|
579
|
-
]
|
|
580
|
-
}
|
|
581
|
+
stages=stages
|
|
582
|
+
)
|
|
581
583
|
|
|
582
|
-
# 使用配置创建并运行 pipeline
|
|
583
|
-
pipeline = create_pipeline_from_config(config)
|
|
584
584
|
pipeline.run()
|
|
585
585
|
```
|
|
586
586
|
|
|
587
|
-
### 示例
|
|
587
|
+
### 示例 1.1: 输出配置字典
|
|
588
|
+
|
|
589
|
+
手动创建 Pipeline 后,可以使用 `get_config()` 方法获取配置字典:
|
|
588
590
|
|
|
589
591
|
```python
|
|
590
|
-
from
|
|
591
|
-
|
|
592
|
+
from xparse_client import (
|
|
593
|
+
Pipeline, LocalSource, LocalDestination,
|
|
594
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
595
|
+
)
|
|
592
596
|
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
597
|
+
# 手动创建 Pipeline
|
|
598
|
+
source = LocalSource(
|
|
599
|
+
directory='./test_files',
|
|
600
|
+
pattern='*.pdf',
|
|
601
|
+
recursive=False
|
|
602
|
+
)
|
|
603
|
+
|
|
604
|
+
destination = LocalDestination(output_dir='./test_output')
|
|
605
|
+
|
|
606
|
+
stages = [
|
|
607
|
+
Stage(type='parse', config=ParseConfig(provider='textin')),
|
|
608
|
+
Stage(type='chunk', config=ChunkConfig(strategy='basic', max_characters=1024)),
|
|
609
|
+
Stage(type='embed', config=EmbedConfig(provider='qwen', model_name='text-embedding-v3'))
|
|
610
|
+
]
|
|
611
|
+
|
|
612
|
+
pipeline = Pipeline(
|
|
613
|
+
source=source,
|
|
614
|
+
destination=destination,
|
|
615
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
616
|
+
api_headers={
|
|
617
|
+
'x-ti-app-id': 'your-app-id',
|
|
618
|
+
'x-ti-secret-code': 'your-secret-code'
|
|
602
619
|
},
|
|
603
|
-
|
|
604
|
-
|
|
620
|
+
stages=stages
|
|
621
|
+
)
|
|
622
|
+
|
|
623
|
+
# 获取配置字典(格式与 create_pipeline_from_config 的入参一致)
|
|
624
|
+
config_dict = pipeline.get_config()
|
|
625
|
+
|
|
626
|
+
# 可以保存为 JSON 文件
|
|
627
|
+
import json
|
|
628
|
+
with open('pipeline_config.json', 'w', encoding='utf-8') as f:
|
|
629
|
+
json.dump(config_dict, f, indent=2, ensure_ascii=False)
|
|
630
|
+
|
|
631
|
+
# 或者用于创建新的 Pipeline(需要补充敏感信息如 access_key, secret_key 等)
|
|
632
|
+
# from xparse_client import create_pipeline_from_config
|
|
633
|
+
# new_pipeline = create_pipeline_from_config(config_dict)
|
|
634
|
+
```
|
|
635
|
+
|
|
636
|
+
### 示例 2: 本地到本地(测试)
|
|
637
|
+
|
|
638
|
+
```python
|
|
639
|
+
from xparse_client import (
|
|
640
|
+
Pipeline, LocalSource, LocalDestination,
|
|
641
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
642
|
+
)
|
|
643
|
+
|
|
644
|
+
# 创建本地数据源
|
|
645
|
+
source = LocalSource(
|
|
646
|
+
directory='./test_files',
|
|
647
|
+
pattern='*.pdf',
|
|
648
|
+
recursive=False
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
# 创建本地输出目的地
|
|
652
|
+
destination = LocalDestination(output_dir='./test_output')
|
|
653
|
+
|
|
654
|
+
# 配置处理阶段
|
|
655
|
+
stages = [
|
|
656
|
+
Stage(
|
|
657
|
+
type='parse',
|
|
658
|
+
config=ParseConfig(provider='textin')
|
|
659
|
+
),
|
|
660
|
+
Stage(
|
|
661
|
+
type='chunk',
|
|
662
|
+
config=ChunkConfig(
|
|
663
|
+
strategy='basic',
|
|
664
|
+
max_characters=1024
|
|
665
|
+
)
|
|
666
|
+
),
|
|
667
|
+
Stage(
|
|
668
|
+
type='embed',
|
|
669
|
+
config=EmbedConfig(
|
|
670
|
+
provider='qwen',
|
|
671
|
+
model_name='text-embedding-v3'
|
|
672
|
+
)
|
|
673
|
+
)
|
|
674
|
+
]
|
|
675
|
+
|
|
676
|
+
# 创建并运行 Pipeline
|
|
677
|
+
pipeline = Pipeline(
|
|
678
|
+
source=source,
|
|
679
|
+
destination=destination,
|
|
680
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
681
|
+
api_headers={
|
|
605
682
|
'x-ti-app-id': 'your-app-id',
|
|
606
683
|
'x-ti-secret-code': 'your-secret-code'
|
|
607
684
|
},
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
{
|
|
611
|
-
'type': 'parse',
|
|
612
|
-
'config': {
|
|
613
|
-
'provider': 'textin'
|
|
614
|
-
}
|
|
615
|
-
},
|
|
616
|
-
{
|
|
617
|
-
'type': 'chunk',
|
|
618
|
-
'config': {
|
|
619
|
-
'strategy': 'basic',
|
|
620
|
-
'max_characters': 1024
|
|
621
|
-
}
|
|
622
|
-
},
|
|
623
|
-
{
|
|
624
|
-
'type': 'embed',
|
|
625
|
-
'config': {
|
|
626
|
-
'provider': 'qwen',
|
|
627
|
-
'model_name': 'text-embedding-v3'
|
|
628
|
-
}
|
|
629
|
-
}
|
|
630
|
-
]
|
|
631
|
-
}
|
|
685
|
+
stages=stages
|
|
686
|
+
)
|
|
632
687
|
|
|
633
|
-
pipeline = create_pipeline_from_config(config)
|
|
634
688
|
pipeline.run()
|
|
635
689
|
```
|
|
636
690
|
|
|
637
691
|
### 示例 3: 不同分块策略的配置
|
|
638
692
|
|
|
639
693
|
```python
|
|
640
|
-
from xparse_client import create_pipeline_from_config
|
|
694
|
+
from xparse_client import (
|
|
695
|
+
Pipeline, S3Source, MilvusDestination,
|
|
696
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
697
|
+
)
|
|
698
|
+
|
|
699
|
+
# 创建数据源和目的地
|
|
700
|
+
source = S3Source(...)
|
|
701
|
+
destination = MilvusDestination(...)
|
|
641
702
|
|
|
642
703
|
# 配置 1:按页面分块(适合 PDF 文档)
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
'
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
'type': 'embed',
|
|
665
|
-
'config': {
|
|
666
|
-
'provider': 'qwen',
|
|
667
|
-
'model_name': 'text-embedding-v4' # 使用更高精度的模型
|
|
668
|
-
}
|
|
669
|
-
}
|
|
670
|
-
]
|
|
671
|
-
}
|
|
704
|
+
stages_by_page = [
|
|
705
|
+
Stage(
|
|
706
|
+
type='parse',
|
|
707
|
+
config=ParseConfig(provider='textin')
|
|
708
|
+
),
|
|
709
|
+
Stage(
|
|
710
|
+
type='chunk',
|
|
711
|
+
config=ChunkConfig(
|
|
712
|
+
strategy='by_page', # 按页面分块
|
|
713
|
+
max_characters=2048, # 增大块大小
|
|
714
|
+
overlap=100 # 页面间重叠 100 字符
|
|
715
|
+
)
|
|
716
|
+
),
|
|
717
|
+
Stage(
|
|
718
|
+
type='embed',
|
|
719
|
+
config=EmbedConfig(
|
|
720
|
+
provider='qwen',
|
|
721
|
+
model_name='text-embedding-v4' # 使用更高精度的模型
|
|
722
|
+
)
|
|
723
|
+
)
|
|
724
|
+
]
|
|
672
725
|
|
|
673
726
|
# 配置 2:按标题分块(适合结构化文档)
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
'
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
'type': 'embed',
|
|
696
|
-
'config': {
|
|
697
|
-
'provider': 'qwen',
|
|
698
|
-
'model_name': 'text-embedding-v3'
|
|
699
|
-
}
|
|
700
|
-
}
|
|
701
|
-
]
|
|
702
|
-
}
|
|
727
|
+
stages_by_title = [
|
|
728
|
+
Stage(
|
|
729
|
+
type='parse',
|
|
730
|
+
config=ParseConfig(provider='textin')
|
|
731
|
+
),
|
|
732
|
+
Stage(
|
|
733
|
+
type='chunk',
|
|
734
|
+
config=ChunkConfig(
|
|
735
|
+
strategy='by_title', # 按标题分块
|
|
736
|
+
include_orig_elements=True, # 保留原始元素信息
|
|
737
|
+
max_characters=1536
|
|
738
|
+
)
|
|
739
|
+
),
|
|
740
|
+
Stage(
|
|
741
|
+
type='embed',
|
|
742
|
+
config=EmbedConfig(
|
|
743
|
+
provider='qwen',
|
|
744
|
+
model_name='text-embedding-v3'
|
|
745
|
+
)
|
|
746
|
+
)
|
|
747
|
+
]
|
|
703
748
|
|
|
704
749
|
# 根据文档类型选择配置
|
|
705
|
-
pipeline =
|
|
750
|
+
pipeline = Pipeline(
|
|
751
|
+
source=source,
|
|
752
|
+
destination=destination,
|
|
753
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
754
|
+
api_headers={...},
|
|
755
|
+
stages=stages_by_page # 或 stages_by_title
|
|
756
|
+
)
|
|
706
757
|
pipeline.run()
|
|
707
758
|
```
|
|
708
759
|
|
|
709
760
|
### 示例 4: FTP 数据源配置
|
|
710
761
|
|
|
711
762
|
```python
|
|
712
|
-
from xparse_client import create_pipeline_from_config
|
|
763
|
+
from xparse_client import (
|
|
764
|
+
Pipeline, FtpSource, MilvusDestination,
|
|
765
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
766
|
+
)
|
|
713
767
|
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
'
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
768
|
+
# 创建 FTP 数据源
|
|
769
|
+
source = FtpSource(
|
|
770
|
+
host='ftp.example.com',
|
|
771
|
+
port=21,
|
|
772
|
+
username='user',
|
|
773
|
+
password='pass',
|
|
774
|
+
pattern='*.pdf',
|
|
775
|
+
recursive=False
|
|
776
|
+
)
|
|
777
|
+
|
|
778
|
+
# 创建 Milvus 目的地
|
|
779
|
+
destination = MilvusDestination(
|
|
780
|
+
db_path='./vectors.db',
|
|
781
|
+
collection_name='ftp_docs',
|
|
782
|
+
dimension=1024
|
|
783
|
+
)
|
|
784
|
+
|
|
785
|
+
# 配置处理阶段
|
|
786
|
+
stages = [
|
|
787
|
+
Stage(
|
|
788
|
+
type='parse',
|
|
789
|
+
config=ParseConfig(provider='textin')
|
|
790
|
+
),
|
|
791
|
+
Stage(
|
|
792
|
+
type='chunk',
|
|
793
|
+
config=ChunkConfig(
|
|
794
|
+
strategy='basic',
|
|
795
|
+
max_characters=1024
|
|
796
|
+
)
|
|
797
|
+
),
|
|
798
|
+
Stage(
|
|
799
|
+
type='embed',
|
|
800
|
+
config=EmbedConfig(
|
|
801
|
+
provider='qwen',
|
|
802
|
+
model_name='text-embedding-v3'
|
|
803
|
+
)
|
|
804
|
+
)
|
|
805
|
+
]
|
|
806
|
+
|
|
807
|
+
# 创建并运行 Pipeline
|
|
808
|
+
pipeline = Pipeline(
|
|
809
|
+
source=source,
|
|
810
|
+
destination=destination,
|
|
811
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
812
|
+
api_headers={
|
|
734
813
|
'x-ti-app-id': 'app-id',
|
|
735
814
|
'x-ti-secret-code': 'secret'
|
|
736
815
|
},
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
'stages': [
|
|
740
|
-
{
|
|
741
|
-
'type': 'parse',
|
|
742
|
-
'config': {
|
|
743
|
-
'provider': 'textin'
|
|
744
|
-
}
|
|
745
|
-
},
|
|
746
|
-
{
|
|
747
|
-
'type': 'chunk',
|
|
748
|
-
'config': {
|
|
749
|
-
'strategy': 'basic',
|
|
750
|
-
'max_characters': 1024
|
|
751
|
-
}
|
|
752
|
-
},
|
|
753
|
-
{
|
|
754
|
-
'type': 'embed',
|
|
755
|
-
'config': {
|
|
756
|
-
'provider': 'qwen',
|
|
757
|
-
'model_name': 'text-embedding-v3'
|
|
758
|
-
}
|
|
759
|
-
}
|
|
760
|
-
]
|
|
761
|
-
}
|
|
816
|
+
stages=stages
|
|
817
|
+
)
|
|
762
818
|
|
|
763
|
-
pipeline = create_pipeline_from_config(config)
|
|
764
819
|
pipeline.run()
|
|
765
820
|
```
|
|
766
821
|
|
|
767
822
|
### 示例 5: 获取处理统计信息
|
|
768
823
|
|
|
769
824
|
```python
|
|
770
|
-
from
|
|
825
|
+
from datetime import datetime, timezone
|
|
826
|
+
from xparse_client import (
|
|
827
|
+
Pipeline, LocalSource, LocalDestination,
|
|
828
|
+
ParseConfig, ChunkConfig, EmbedConfig, Stage
|
|
829
|
+
)
|
|
771
830
|
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
831
|
+
# 创建 Pipeline
|
|
832
|
+
source = LocalSource(
|
|
833
|
+
directory='./docs',
|
|
834
|
+
pattern='*.pdf',
|
|
835
|
+
recursive=False
|
|
836
|
+
)
|
|
837
|
+
|
|
838
|
+
destination = LocalDestination(output_dir='./output')
|
|
839
|
+
|
|
840
|
+
stages = [
|
|
841
|
+
Stage(
|
|
842
|
+
type='parse',
|
|
843
|
+
config=ParseConfig(provider='textin')
|
|
844
|
+
),
|
|
845
|
+
Stage(
|
|
846
|
+
type='chunk',
|
|
847
|
+
config=ChunkConfig(
|
|
848
|
+
strategy='basic',
|
|
849
|
+
max_characters=1024
|
|
850
|
+
)
|
|
851
|
+
),
|
|
852
|
+
Stage(
|
|
853
|
+
type='embed',
|
|
854
|
+
config=EmbedConfig(
|
|
855
|
+
provider='qwen',
|
|
856
|
+
model_name='text-embedding-v3'
|
|
857
|
+
)
|
|
858
|
+
)
|
|
859
|
+
]
|
|
860
|
+
|
|
861
|
+
pipeline = Pipeline(
|
|
862
|
+
source=source,
|
|
863
|
+
destination=destination,
|
|
864
|
+
api_base_url='https://api.textin.com/api/xparse',
|
|
865
|
+
api_headers={
|
|
784
866
|
'x-ti-app-id': 'your-app-id',
|
|
785
867
|
'x-ti-secret-code': 'your-secret-code'
|
|
786
868
|
},
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
'type': 'parse',
|
|
790
|
-
'config': {
|
|
791
|
-
'provider': 'textin'
|
|
792
|
-
}
|
|
793
|
-
},
|
|
794
|
-
{
|
|
795
|
-
'type': 'chunk',
|
|
796
|
-
'config': {
|
|
797
|
-
'strategy': 'basic',
|
|
798
|
-
'max_characters': 1024
|
|
799
|
-
}
|
|
800
|
-
},
|
|
801
|
-
{
|
|
802
|
-
'type': 'embed',
|
|
803
|
-
'config': {
|
|
804
|
-
'provider': 'qwen',
|
|
805
|
-
'model_name': 'text-embedding-v3'
|
|
806
|
-
}
|
|
807
|
-
}
|
|
808
|
-
]
|
|
809
|
-
}
|
|
810
|
-
|
|
811
|
-
pipeline = create_pipeline_from_config(config)
|
|
869
|
+
stages=stages
|
|
870
|
+
)
|
|
812
871
|
|
|
813
872
|
# 处理单个文件并获取统计信息
|
|
814
873
|
file_bytes, data_source = pipeline.source.read_file('document.pdf')
|