xparse-client 0.2.11__py3-none-any.whl → 0.3.0b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +111 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +215 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +134 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +134 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b3.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +1 -0
  69. example/run_pipeline.py +0 -506
  70. example/run_pipeline_test.py +0 -458
  71. xparse_client/pipeline/__init__.py +0 -3
  72. xparse_client/pipeline/config.py +0 -129
  73. xparse_client/pipeline/destinations.py +0 -487
  74. xparse_client/pipeline/pipeline.py +0 -622
  75. xparse_client/pipeline/sources.py +0 -585
  76. xparse_client-0.2.11.dist-info/METADATA +0 -1050
  77. xparse_client-0.2.11.dist-info/RECORD +0 -13
@@ -0,0 +1,68 @@
1
+ example/1_basic_api_usage.py,sha256=x_ZHEWXz6z7qp-sMLBvq7Vpu6nA7YxVCizyQpDday6M,5802
2
+ example/2_async_job.py,sha256=MCjWii3ksmONiIyCWviBrKPZIwrlCPDvmUPP_F8qXPM,6353
3
+ example/3_local_workflow.py,sha256=ZxoT8N0Nz4_s-bkLF0l6UDqBJk_eZZi99WUtUhJseXs,8980
4
+ example/4_advanced_workflow.py,sha256=cgG0xke8-WNPxqx7iSTl9qgoJsVctXf5mjDkp-SaQho,10159
5
+ example/README.md,sha256=d-DumaZwbV96K6ZTEryAlDSk9HUhRBg0gG_Yekx9PqA,2336
6
+ example/config_example.json,sha256=mBaLmKbgxFBKUgA-FtCTiLMtq24z0QRwKbPeYDGwA9I,2517
7
+ tests/conftest.py,sha256=h6sy9RNQmXTQNTu86h7ATDAg-9wI4cKjzKXUbNNH5q8,10037
8
+ tests/unit/__init__.py,sha256=C1308ox8hFFtq3A1vBAVPzL6r8HGVbEbC42ULCpFkjs,25
9
+ tests/unit/test_base.py,sha256=HRW2irYNAX863PBCtJWvN9EJ_UttfOwRI7CRnDdcjEk,11599
10
+ tests/unit/test_client.py,sha256=92geEJ6jBht7nFr5suRbJ6SmkvzCu_HGB5MhSl4Qmr8,3534
11
+ tests/unit/test_config.py,sha256=OAUKvmEtpC-eAglnDzhp1bUyMlUpaeqLAa1QwyxCUdw,5498
12
+ tests/unit/test_exceptions.py,sha256=BEaGSs34_jmdjUuT4e3S3QONDqldamiXNId04PPdMt4,5155
13
+ tests/unit/test_http.py,sha256=3h-O6k5rcVP_I1sPoISOYLdHAKtSxoPrDOfC3EJyNi0,16490
14
+ tests/unit/api/__init__.py,sha256=5HiLverbqHC-yFxXClrHBjHq62KoONLZPLMMOkYR3eU,23
15
+ tests/unit/api/test_extract.py,sha256=QX4_BAQN6MkRTJQocEf9u6HwOA_XoY_a2djMcuLBCTo,6959
16
+ tests/unit/api/test_local.py,sha256=_DXC8PattJ-CH9inlwDu9s1RCocTGVeYpKFpTiwDwa4,7541
17
+ tests/unit/api/test_parse.py,sha256=EUA3emZFAFefop3CcUgqyLGC1lhd5ujx6x-dGE5h3pw,12639
18
+ tests/unit/api/test_pipeline.py,sha256=0TC_upZbSllQfuz7uGOCvx28BDNCnX1POR92JLnfrIE,11067
19
+ tests/unit/api/test_workflows.py,sha256=lYVbOWph8ZqxszFQTgcFHnxgdp9jjJM35F-wKO3IUuY,3286
20
+ tests/unit/connectors/test_ftp.py,sha256=7NbV7cL9tIy03L7_sWoALrfDmjda_xOgfAoBfUnF9ks,14605
21
+ tests/unit/connectors/test_local_connectors.py,sha256=QS2T_eLXJtHSaOrbtwYrRSgAVrL3jkO3QVPoMEpi9d8,9855
22
+ tests/unit/connectors/test_milvus.py,sha256=X2VRv639QSntpjx_7u1cOwJg0HSKPumuDmd26FWxMKA,11911
23
+ tests/unit/connectors/test_qdrant.py,sha256=wCIG36CjbDhF0Jq53jnNzMliq5eXMWgnCPiIHtVBbew,12682
24
+ tests/unit/connectors/test_s3.py,sha256=Xbo-S9khgS1S3LM3BUIuYp-28YLEmEW0SP1O-iJ8g5g,18444
25
+ tests/unit/connectors/test_smb.py,sha256=yrI_xNeOe84vONyxXNOGLbWa7tcyxjzo9G5Z_Tjzn-w,12378
26
+ tests/unit/connectors/test_utils.py,sha256=RfnzTW5-9w2f5YeLbNTB_z3vVnsUSL7DwXNobo7xZUA,10302
27
+ tests/unit/models/test_local.py,sha256=QnxzWKOivqeITd0CPzZLEzGbaMDtiwsPiwyEHiPpcPY,1325
28
+ tests/unit/models/test_pipeline_stages.py,sha256=vT7Pd0lRWdChTQJuCZ3iUBx8C0xFXQXTBVB6jtWtEKU,3972
29
+ tests/unit/models/test_workflows.py,sha256=Al4d3SQI1jTz585i1kD569TZx4IWNrhyf3_Le6-cq2k,1692
30
+ xparse_client/__init__.py,sha256=FyR7W0eteapDoPj2pU4VoTobM8XR2B7f_g4yVI-v2v8,3956
31
+ xparse_client/_base.py,sha256=Vx85kDns565zwv6Zl6_8jl6b43nNprfBHbqdCVPGGTc,4420
32
+ xparse_client/_client.py,sha256=ZGxZ9xhUitTP-eRhte57ekqUkrXxNubFDL5gojajA2s,6964
33
+ xparse_client/_config.py,sha256=BG3cEW5ywKhmtfHjfTFrdezs2Ugc9P2Y85tzu5OZZ4M,6480
34
+ xparse_client/_http.py,sha256=OrnrkTeuG5tEpuD-a1ixLIFPdiqnVIYtuZCi3uXAzJ4,11381
35
+ xparse_client/exceptions.py,sha256=FNozgCFaQSsdeyxE6P-3CZItkTSfFJcIQVrddHWKN0s,9521
36
+ xparse_client/api/__init__.py,sha256=MZ3wBTDU666efOBv81MKKtc1UkjUsjzIEOpsnDUAHD4,203
37
+ xparse_client/api/extract.py,sha256=87tH-y4sXPKW35l-d0GmZdK9RIXWq4tfmGZXYJhIN60,3011
38
+ xparse_client/api/local.py,sha256=LkwtGFCZifWuWbQlq2Y2dXPB3aXW1x1P89b-o0MJC7s,7662
39
+ xparse_client/api/parse.py,sha256=nvpu5kiN43QlyMW3N6vMruVADu6MdjS-TMbLFjBh-cc,5870
40
+ xparse_client/api/pipeline.py,sha256=bPCSpWjER-7c7L0iMQ_Xz1VTe5DARMsX5XIcrayGKzw,4530
41
+ xparse_client/api/workflows.py,sha256=29_Rf4bIH1fiJb_bWShkUm-et97ym5ymQFh5i3b5mDI,5945
42
+ xparse_client/connectors/__init__.py,sha256=BLGxk9i8BsVCr4RMrCIh8b9cywBqXXmcDHqfF7C_FX0,1113
43
+ xparse_client/connectors/_utils.py,sha256=Qb2OyUg8gvzyMBELx4h0GmjRSdT4d9C45v9m6NUNuMg,3515
44
+ xparse_client/connectors/destinations/__init__.py,sha256=hmvWAXj58yLmePMCEnqKbNhJ74UxErugSFxBxPhlb14,1401
45
+ xparse_client/connectors/destinations/base.py,sha256=DfwwH5S8VPwEUyvWqUKDV9Bq-7A59lqoZHruOjB7L50,3218
46
+ xparse_client/connectors/destinations/local.py,sha256=drZ5wkPMiYd8iOROmLJR3eoCXcYKUpAPPjSFpafeNW8,2542
47
+ xparse_client/connectors/destinations/milvus.py,sha256=MhElUsIMY1g6_c_bD-wjpWOM5PiuZ9bLsgXACW7EY1c,7339
48
+ xparse_client/connectors/destinations/qdrant.py,sha256=Mo1wCwtsM7OIMLZ0HsEBnvE8N9BGnyTGBGJ1B8oYIew,7661
49
+ xparse_client/connectors/destinations/s3.py,sha256=2aeTW0cNUVPLBL8up9InFfgXBcFDPtqFNGa5FbUphhk,4728
50
+ xparse_client/connectors/sources/__init__.py,sha256=830EDkdGJtPC2p1MhIP108uLibGeMoXcXAwPAEACPvQ,1143
51
+ xparse_client/connectors/sources/base.py,sha256=fbRWFWuIcw_XWrOPpPkBnQ0Pf2R9N7Ry4523rxWcOpc,1863
52
+ xparse_client/connectors/sources/ftp.py,sha256=bjOjnd-2l0MZ2EAcGM3jvuTP7FMWBiQaEbgHzhIl3og,8170
53
+ xparse_client/connectors/sources/local.py,sha256=ZYTGWdQmUpzdenn3JQ_p0u9-8x7xXk5Ani2GIAqTGyE,5156
54
+ xparse_client/connectors/sources/s3.py,sha256=RCzinmz9cCqp9cFcfEozBapa2nllFvJCw9P-aLeOJQA,7219
55
+ xparse_client/connectors/sources/smb.py,sha256=_F496mipGZJ0y17H-sYTlqAf2uJScxGyit7kirTyDKc,7786
56
+ xparse_client/models/__init__.py,sha256=zLBFirgXDisEeqKHqpsH1bN5p2zE7ZmCtVENZ69mlOg,1213
57
+ xparse_client/models/chunk.py,sha256=VXKrATgk7bhlPVbJTFOUvOWu9oEnNMgMF6AnQarJD7o,986
58
+ xparse_client/models/embed.py,sha256=Tg9iqOiLZy63m4T0q2z9cq3neuQbMnqDqT55-sVA7CE,1652
59
+ xparse_client/models/extract.py,sha256=ZwMy0MsTt775Bq3UsZ9WlXFNCLrSnN0j1RgXQ-vFNqo,915
60
+ xparse_client/models/local.py,sha256=hqKmyWTU_EhPk6qybtBzmbLybBBaiasTpqM4_zP-ipo,698
61
+ xparse_client/models/parse.py,sha256=K34Fodo3emOPZJJQeu3MLjvA7HkVMxAuXk20UvO4lfQ,3297
62
+ xparse_client/models/pipeline.py,sha256=24bDzhrVotQ8St6VLEJLvG2cZF0G2AMioKI34I_hJXI,3297
63
+ xparse_client/models/workflows.py,sha256=BivMdGOAmhP6oYLQSGAAN7yml2xb7vHHrpzwLgN_Afk,1754
64
+ xparse_client-0.3.0b3.dist-info/licenses/LICENSE,sha256=7iuki7DyWMGB8PBzsht7PUt0YjdIcPjrcXNyUFgMJsw,1070
65
+ xparse_client-0.3.0b3.dist-info/METADATA,sha256=6F2nf_y5odBzTzTLF8N4MA-aqW9F3_EnCCmnPtiqwvg,25453
66
+ xparse_client-0.3.0b3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
67
+ xparse_client-0.3.0b3.dist-info/top_level.txt,sha256=oQGc_qysOmnSAaLjwB72wH8RBHRAmxB-_qb-Uj6u56o,28
68
+ xparse_client-0.3.0b3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2025 ACG Xparse Authors
3
+ Copyright (c) 2025 INTSIG-TEXTIN
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,2 +1,3 @@
1
1
  example
2
+ tests
2
3
  xparse_client
example/run_pipeline.py DELETED
@@ -1,506 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- encoding: utf-8 -*-
3
- '''
4
- Pipeline 运行脚本
5
- 快速启动和运行 Pipeline 的示例
6
- '''
7
-
8
- import json
9
- from datetime import datetime, timezone
10
- from xparse_client import create_pipeline_from_config, S3Source, LocalSource, MilvusDestination, LocalDestination, Pipeline, SmbSource, S3Destination, FtpSource
11
-
12
-
13
- # ============================================================================
14
- # 常量配置
15
- # ============================================================================
16
-
17
- # API 请求头配置
18
- API_HEADERS = {
19
- 'x-ti-app-id': '4c0032d9e4d93b0ad674cac0d75256e7',
20
- 'x-ti-secret-code': '7104f599ad02b8468fc619f7605d2d8d'
21
- }
22
-
23
-
24
- # ============================================================================
25
- # 方式 1: 使用配置字典
26
- # ============================================================================
27
-
28
- def run_with_config():
29
- """使用配置字典运行 pipeline"""
30
-
31
- config = {
32
- 'source': {
33
- 'type': 's3',
34
- 'endpoint': 'https://textin-minio-api.ai.intsig.net',
35
- 'access_key': 'IEQspf8C7fVcgmp3AZWl',
36
- 'secret_key': 'kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
37
- 'bucket': 'textin-test',
38
- 'prefix': '', # 留空处理所有文件,或指定如 'milvus/'
39
- 'region': 'us-east-1'
40
- },
41
- 'destination': {
42
- 'type': 'milvus',
43
- 'db_path': './milvus_pipeline.db',
44
- 'collection_name': 'pipeline_collection',
45
- 'dimension': 1024
46
- },
47
- 'api_base_url': 'https://api.textin.com/api/xparse',
48
- 'api_headers': API_HEADERS,
49
- # Stages 配置
50
- 'stages': [
51
- {
52
- 'type': 'parse',
53
- 'config': {
54
- 'provider': 'textin'
55
- }
56
- },
57
- {
58
- 'type': 'chunk',
59
- 'config': {
60
- 'strategy': 'basic', # 分块策略: 'basic' | 'by_title' | 'by_page'
61
- 'include_orig_elements': False, # 是否包含原始元素
62
- 'new_after_n_chars': 512, # 多少字符后创建新块
63
- 'max_characters': 1024, # 最大字符数
64
- 'overlap': 0 # 重叠字符数
65
- }
66
- },
67
- {
68
- 'type': 'embed',
69
- 'config': {
70
- 'provider': 'qwen', # 向量化供应商: 'qwen'
71
- 'model_name': 'text-embedding-v3' # 模型名称: 'text-embedding-v3' | 'text-embedding-v4'
72
- }
73
- }
74
- ]
75
- }
76
-
77
- pipeline = create_pipeline_from_config(config)
78
- pipeline.run()
79
-
80
-
81
- # ============================================================================
82
- # 方式 2: 手动创建组件
83
- # ============================================================================
84
-
85
- def run_with_manual_setup():
86
- """手动创建 Source、Destination 和 Pipeline"""
87
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage, PipelineConfig, LocalDestination, QdrantDestination
88
-
89
- # 创建 S3 数据源
90
- # source = S3Source(
91
- # endpoint='https://textin-minio-api.ai.intsig.net',
92
- # access_key='IEQspf8C7fVcgmp3AZWl',
93
- # secret_key='kLj96I8FGbIrPFW08meXivCy4AVdzBijOJWKWOt1',
94
- # bucket='textin-test',
95
- # prefix='',
96
- # region='us-east-1'
97
- # )
98
- # source = S3Source(
99
- # endpoint='https://s3.oss-cn-shanghai.aliyuncs.com',
100
- # access_key='LTAI5tBgsaVfkbh9rbPyuB17',
101
- # secret_key='JFIIaTGiXelv7DgBYNIBSStofF0S98',
102
- # bucket='textin',
103
- # prefix='',
104
- # region='cn-shanghai'
105
- # )
106
- # source=S3Source(
107
- # endpoint='https://S3.oss-cn-shanghai.aliyuncs.com',
108
- # access_key='LTAI5t6ZnqTra8oLmJEfvcr7',
109
- # secret_key='SEbz4oJ4KNJIOTMfphuVGOWmRpGGUG',
110
- # bucket='textin-test-aliyun',
111
- # prefix='',
112
- # region='cn-shanghai'
113
- # )
114
- # source = S3Source(
115
- # endpoint='https://cos.ap-shanghai.myqcloud.com',
116
- # access_key='',
117
- # secret_key='',
118
- # bucket='textin-1300705866',
119
- # prefix='',
120
- # region='ap-shanghai'
121
- # )
122
- # source = S3Source(
123
- # endpoint='https://tos-s3-cn-shanghai.volces.com',
124
- # access_key='AKLTMzNkZjk1OGM3MzBjNGQ1ZjhkMGQ4MThlNjBjYjZjYzA',
125
- # secret_key='TnpWaE0yRTVaamRqTmpSbU5EY3pObUZrTTJVNE5qUm1NR0ppWkRrMFlqVQ==',
126
- # bucket='textin',
127
- # prefix='',
128
- # region='cn-shanghai'
129
- # )
130
- # source = S3Source(
131
- # endpoint='https://obs.cn-east-3.myhuaweicloud.com',
132
- # access_key='',
133
- # secret_key='',
134
- # bucket='textin',
135
- # prefix='',
136
- # region='cn-east-3'
137
- # )
138
- # source = S3Source(
139
- # endpoint='https://s3.us-east-1.amazonaws.com',
140
- # access_key='AKIA6QUE3TVZADUWA4PO',
141
- # secret_key='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',
142
- # bucket='textin-test',
143
- # prefix='',
144
- # region='us-east-1'
145
- # )
146
- # source = S3Source(
147
- # endpoint='http://127.0.0.1:9000',
148
- # access_key='',
149
- # secret_key='',
150
- # bucket='textin',
151
- # prefix='',
152
- # region='us-east-1'
153
- # )
154
- # source = SmbSource(
155
- # host='internal-storage.intsig.net',
156
- # share_name='ke_wang',
157
- # username='ke_wang',
158
- # password='',
159
- # domain='INTSIG.COM'
160
- # )
161
- # source = FtpSource(
162
- # host='127.0.0.1',
163
- # port=21,
164
- # # recursive=True,
165
- # username='', # 用户名,按照实际填写
166
- # password='' # 密码,按照实际填写
167
- # )
168
- source = LocalSource(
169
- directory='/Users/ke_wang/Documents/doc',
170
- pattern=['*.pdf'],
171
- recursive=True,
172
- )
173
-
174
- # source=S3Source(
175
- # endpoint='https://obs.cn-north-4.myhuaweicloud.com',
176
- # access_key='HPUAFT3D1Q6O6UUN1RWQ',
177
- # secret_key='4zIk8x37nZiDS9P585BTFCWsOSo5G7ok1yRWtEA1',
178
- # bucket='textin-test-ywj',
179
- # prefix='',
180
- # region='cn-north-4'
181
- # )# 华为云
182
-
183
- # 创建 Milvus 目的地
184
- # destination = MilvusDestination(
185
- # db_path='./milvus_pipeline1.db',
186
- # collection_name='pipeline_collection',
187
- # dimension=1024
188
- # )
189
-
190
- destination = LocalDestination(
191
- output_dir='./result'
192
- )
193
-
194
- # destination = MilvusDestination(
195
- # db_path='https://in03-5388093d0db1707.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn', # zilliz连接地址
196
- # collection_name='textin_test_3_copy', # 数据库collection名称
197
- # dimension=1024, # 向量维度,需与 embed API 返回一致
198
- # api_key='872c3f5b3f3995c80dcda5c3d34f1f608815aef7671b6ee391ab37e40e79c892ce56d9c8c6565a03a3fd66da7e11b67f384c5c46' # Zilliz Cloud API Key
199
- # )
200
-
201
- # destination = S3Destination(
202
- # endpoint='https://cos.ap-shanghai.myqcloud.com',
203
- # access_key='',
204
- # secret_key='',
205
- # bucket='textin-1300705866',
206
- # prefix='result',
207
- # region='ap-shanghai'
208
- # )
209
-
210
- # destination = QdrantDestination(
211
- # url='https://1325db22-7dd8-4fc9-930b-f969d4963b3d.us-east-1-1.aws.cloud.qdrant.io:6333',
212
- # collection_name='textin1',
213
- # dimension=1024,
214
- # api_key='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.TGnFB1pAD7c7IqSOvTpgCPpHXSnnoKhWEQ5pQ8DrBnI',
215
- # )
216
-
217
- # 使用新的 stages 格式创建配置
218
- stages = [
219
- Stage(
220
- type='parse',
221
- config=ParseConfig(provider='textin', page_ranges='3')
222
- ),
223
- Stage(
224
- type='chunk',
225
- config=ChunkConfig(
226
- strategy='by_title', # 按标题分块
227
- include_orig_elements=False,
228
- new_after_n_chars=512,
229
- max_characters=1024,
230
- overlap=50 # 块之间重叠 50 字符
231
- )
232
- ),
233
- Stage(
234
- type='embed',
235
- config=EmbedConfig(
236
- provider='qwen',
237
- model_name='text-embedding-v3'
238
- )
239
- )
240
- ]
241
-
242
- # 配置中间结果保存
243
- intermediate_results_destination = LocalDestination(
244
- output_dir='./intermediate_results'
245
- )
246
-
247
- pipeline_config = PipelineConfig(
248
- include_intermediate_results=True,
249
- intermediate_results_destination=intermediate_results_destination
250
- )
251
-
252
- # 创建 Pipeline
253
- pipeline = Pipeline(
254
- source=source,
255
- destination=destination,
256
- api_base_url='https://textin-api-go-pre.ai.intsig.net/api/xparse',
257
- api_headers=API_HEADERS,
258
- stages=stages,
259
- pipeline_config=pipeline_config
260
- )
261
-
262
- # 运行
263
- # config = pipeline.get_config()
264
- pipeline.run()
265
-
266
-
267
- # ============================================================================
268
- # 方式 3: 本地测试(本地文件 -> 本地输出)
269
- # ============================================================================
270
-
271
- def run_local_test():
272
- """使用本地文件进行测试"""
273
-
274
- config = {
275
- 'source': {
276
- 'type': 'local',
277
- 'directory': '/Users/ke_wang/Documents/doc',
278
- 'pattern': '*.pdf'
279
- },
280
- 'destination': {
281
- 'type': 's3',
282
- 'endpoint': 'https://textin-minio-api.ai.intsig.net',
283
- 'access_key': '',
284
- 'secret_key': '',
285
- 'bucket': 'textin-test',
286
- 'prefix': '',
287
- 'region': 'us-east-1'
288
- },
289
- 'api_base_url': 'https://api.textin.com/api/xparse',
290
- 'api_headers': API_HEADERS,
291
- # Stages 配置
292
- 'stages': [
293
- {
294
- 'type': 'parse',
295
- 'config': {
296
- 'provider': 'textin'
297
- }
298
- },
299
- {
300
- 'type': 'embed',
301
- 'config': {
302
- 'provider': 'qwen',
303
- 'model_name': 'text-embedding-v3'
304
- }
305
- }
306
- ]
307
- }
308
-
309
- pipeline = create_pipeline_from_config(config)
310
- pipeline.run()
311
-
312
-
313
- # ============================================================================
314
- # 方式 4: 处理单个文件
315
- # ============================================================================
316
-
317
- def run_single_file():
318
- """只处理单个文件"""
319
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
320
-
321
- # 创建 pipeline
322
- source = LocalSource(directory='/Users/ke_wang/Documents/doc', pattern='*.pdf')
323
- destination = LocalDestination(output_dir='./output')
324
-
325
- # 使用新的 stages 格式创建配置
326
- stages = [
327
- Stage(
328
- type='parse',
329
- config=ParseConfig(provider='textin')
330
- ),
331
- Stage(
332
- type='chunk',
333
- config=ChunkConfig(
334
- strategy='by_page', # 按页面分块
335
- max_characters=2048, # 增大块大小
336
- overlap=100
337
- )
338
- ),
339
- Stage(
340
- type='embed',
341
- config=EmbedConfig(
342
- provider='qwen',
343
- model_name='text-embedding-v4' # 使用更高精度的模型
344
- )
345
- )
346
- ]
347
-
348
- pipeline = Pipeline(
349
- source=source,
350
- destination=destination,
351
- api_base_url='https://api.textin.com/api/xparse',
352
- api_headers=API_HEADERS,
353
- stages=stages
354
- )
355
-
356
- # 只处理指定文件
357
- file_path = '4e3250f00210431fb29ca0c808.pdf' # 相对于 source directory 的路径
358
- success = pipeline.process_file(file_path)
359
-
360
- if success:
361
- print(f"\n✅ 文件 {file_path} 处理成功!")
362
- else:
363
- print(f"\n❌ 文件 {file_path} 处理失败!")
364
-
365
-
366
- # ============================================================================
367
- # 方式 5: 自定义处理流程
368
- # ============================================================================
369
-
370
- def run_custom_flow():
371
- """自定义处理流程,手动控制文件处理"""
372
- from xparse_client import ChunkConfig, EmbedConfig, ParseConfig, Stage
373
-
374
- # 创建组件
375
- source = S3Source(
376
- endpoint='https://textin-minio-api.ai.intsig.net',
377
- access_key='',
378
- secret_key='',
379
- bucket='textin-test',
380
- prefix='',
381
- region='us-east-1',
382
- pattern='*.pdf'
383
- )
384
-
385
- destination = MilvusDestination(
386
- db_path='./milvus_custom.db',
387
- collection_name='custom_collection',
388
- dimension=1024
389
- )
390
-
391
- # 使用新的 stages 格式创建配置
392
- stages = [
393
- Stage(
394
- type='parse',
395
- config=ParseConfig(provider='textin')
396
- ),
397
- Stage(
398
- type='chunk',
399
- config=ChunkConfig(
400
- strategy='by_title',
401
- include_orig_elements=True,
402
- max_characters=1536,
403
- overlap=80
404
- )
405
- ),
406
- Stage(
407
- type='embed',
408
- config=EmbedConfig(
409
- provider='qwen',
410
- model_name='text-embedding-v4'
411
- )
412
- )
413
- ]
414
-
415
- pipeline = Pipeline(
416
- source=source,
417
- destination=destination,
418
- api_base_url='https://api.textin.com/api/xparse',
419
- api_headers=API_HEADERS,
420
- stages=stages
421
- )
422
-
423
- # 手动控制文件处理
424
- files = source.list_files()
425
-
426
- for file_path in files[:2]: # 只处理前2个文件
427
- print(f"\n处理: {file_path}")
428
- file_bytes, data_source = source.read_file(file_path)
429
- data_source['date_processed'] = datetime.now(timezone.utc).timestamp()
430
-
431
- # 使用 pipeline 接口处理
432
- result = pipeline.process_with_pipeline(file_bytes, file_path, data_source)
433
-
434
- if result:
435
- embedded, stats = result
436
- print(f" - 原始元素: {stats.original_elements}")
437
- print(f" - 分块后: {stats.chunked_elements}")
438
- print(f" - 向量化: {stats.embedded_elements}")
439
-
440
- # 写入
441
- metadata = {
442
- 'file_name': file_path,
443
- 'data_source': data_source,
444
- 'stats': {
445
- 'original_elements': stats.original_elements,
446
- 'chunked_elements': stats.chunked_elements,
447
- 'embedded_elements': stats.embedded_elements
448
- }
449
- }
450
- destination.write(embedded, metadata)
451
- print(f"✓ 完成: {file_path}")
452
- else:
453
- print(f"✗ 失败: {file_path}")
454
-
455
-
456
- # ============================================================================
457
- # 主函数
458
- # ============================================================================
459
-
460
- def main():
461
- """主函数 - 选择运行方式"""
462
-
463
- print("=" * 60)
464
- print("Pipeline 运行脚本")
465
- print("=" * 60)
466
- print("\n请选择运行方式:")
467
- print("1. 使用配置字典 (S3 -> Milvus) [基础配置]")
468
- print("2. 手动创建组件 (S3 -> Milvus) [按标题分块 + 自定义配置]")
469
- print("3. 本地测试 (本地文件 -> 本地输出) [基础配置]")
470
- print("4. 处理单个文件 [按页面分块 + V4模型]")
471
- print("5. 自定义处理流程 [手动控制 + 统计信息]")
472
- print()
473
-
474
- try:
475
- choice = input("请输入选项 (1-5) [默认: 1]: ").strip() or '1'
476
-
477
- if choice == '1':
478
- print("\n使用配置字典运行...")
479
- run_with_config()
480
- elif choice == '2':
481
- print("\n手动创建组件运行...")
482
- run_with_manual_setup()
483
- elif choice == '3':
484
- print("\n本地测试模式...")
485
- run_local_test()
486
- elif choice == '4':
487
- print("\n处理单个文件...")
488
- run_single_file()
489
- elif choice == '5':
490
- print("\n自定义处理流程...")
491
- run_custom_flow()
492
- else:
493
- print("无效的选项,使用默认方式运行...")
494
- run_with_config()
495
-
496
- except KeyboardInterrupt:
497
- print("\n\n用户中断执行")
498
- except Exception as e:
499
- print(f"\n程序异常: {str(e)}")
500
- import traceback
501
- traceback.print_exc()
502
-
503
-
504
- if __name__ == '__main__':
505
- main()
506
-