langchain-xparse 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_xparse-1.0.0/LICENSE +21 -0
- langchain_xparse-1.0.0/PKG-INFO +143 -0
- langchain_xparse-1.0.0/README.md +120 -0
- langchain_xparse-1.0.0/langchain_xparse/__init__.py +20 -0
- langchain_xparse-1.0.0/langchain_xparse/client.py +92 -0
- langchain_xparse-1.0.0/langchain_xparse/document_loaders.py +274 -0
- langchain_xparse-1.0.0/langchain_xparse/py.typed +0 -0
- langchain_xparse-1.0.0/langchain_xparse.egg-info/PKG-INFO +143 -0
- langchain_xparse-1.0.0/langchain_xparse.egg-info/SOURCES.txt +12 -0
- langchain_xparse-1.0.0/langchain_xparse.egg-info/dependency_links.txt +1 -0
- langchain_xparse-1.0.0/langchain_xparse.egg-info/requires.txt +7 -0
- langchain_xparse-1.0.0/langchain_xparse.egg-info/top_level.txt +1 -0
- langchain_xparse-1.0.0/pyproject.toml +46 -0
- langchain_xparse-1.0.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 wangxuetong123456
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: langchain-xparse
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: LangChain integration with xParse Pipeline API for document parsing, chunking and embedding
|
|
5
|
+
Author-email: intsig-textin <wangxuetongxztx@163.com>
|
|
6
|
+
Maintainer-email: intsig-textin <wangxuetongxztx@163.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/intsig-textin/langchain-xparse
|
|
9
|
+
Project-URL: Source, https://github.com/intsig-textin/langchain-xparse
|
|
10
|
+
Project-URL: Documentation, https://docs.textin.com/pipeline/overview
|
|
11
|
+
Project-URL: Bug Tracker, https://github.com/intsig-textin/langchain-xparse/issues
|
|
12
|
+
Keywords: langchain,xparse,document-loader,textin,parsing
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: langchain-core>=1.0
|
|
17
|
+
Requires-Dist: httpx>=0.24
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
21
|
+
Requires-Dist: python-dotenv>=1.0; extra == "dev"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# langchain-xparse
|
|
25
|
+
|
|
26
|
+
LangChain integration with [xParse Pipeline API](https://docs.textin.com/pipeline/overview) for document parsing, chunking and embedding. Supports parse / chunk / embed stages only (extract is not supported in this loader).
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
From PyPI:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install langchain-xparse
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Local editable install:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install -e .
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Configuration
|
|
43
|
+
|
|
44
|
+
Set your TextIn credentials (from the [TextIn Workspace](https://www.textin.com/console/dashboard/setting)):
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
export XPARSE_APP_ID="your-app-id"
|
|
48
|
+
export XPARSE_SECRET_CODE="your-secret-code"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Or pass them when creating the loader:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
loader = XParseLoader(
|
|
55
|
+
file_path="doc.pdf",
|
|
56
|
+
app_id="your-app-id",
|
|
57
|
+
secret_code="your-secret-code",
|
|
58
|
+
)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Usage
|
|
62
|
+
|
|
63
|
+
### Basic (parse only)
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from langchain_xparse import XParseLoader
|
|
67
|
+
|
|
68
|
+
loader = XParseLoader(file_path="example.pdf")
|
|
69
|
+
docs = loader.load()
|
|
70
|
+
print(docs[0].page_content[:200])
|
|
71
|
+
print(docs[0].metadata) # source, category, element_id, filename, page_number, ...
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Lazy load
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
for doc in loader.lazy_load():
|
|
78
|
+
# process(doc)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Async
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
async for doc in loader.alazy_load():
|
|
85
|
+
# process(doc)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Convenience params (parse + chunk, or parse + chunk + embed)
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
loader = XParseLoader(
|
|
92
|
+
file_path="doc.pdf",
|
|
93
|
+
parse_provider="textin",
|
|
94
|
+
chunk_strategy="by_title",
|
|
95
|
+
chunk_max_characters=500,
|
|
96
|
+
chunk_overlap=50,
|
|
97
|
+
)
|
|
98
|
+
# Or with embed:
|
|
99
|
+
loader = XParseLoader(
|
|
100
|
+
file_path="doc.pdf",
|
|
101
|
+
parse_provider="textin",
|
|
102
|
+
chunk_strategy="basic",
|
|
103
|
+
chunk_max_characters=1000,
|
|
104
|
+
embed_provider="qwen",
|
|
105
|
+
embed_model_name="text-embedding-v4",
|
|
106
|
+
)
|
|
107
|
+
docs = loader.load()
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Custom stages (advanced)
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
loader = XParseLoader(
|
|
114
|
+
file_path="doc.pdf",
|
|
115
|
+
stages=[
|
|
116
|
+
{"type": "parse", "config": {"provider": "textin"}},
|
|
117
|
+
{"type": "chunk", "config": {"strategy": "by_page", "max_characters": 800}},
|
|
118
|
+
],
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Multiple files
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
loader = XParseLoader(file_path=["a.pdf", "b.pdf"])
|
|
126
|
+
for doc in loader.lazy_load():
|
|
127
|
+
print(doc.metadata.get("source"), doc.page_content[:50])
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### File-like object
|
|
131
|
+
|
|
132
|
+
When passing a file-like object instead of a path, you must set `metadata_filename`:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
with open("doc.pdf", "rb") as f:
|
|
136
|
+
loader = XParseLoader(file=f, metadata_filename="doc.pdf")
|
|
137
|
+
docs = loader.load()
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## References
|
|
141
|
+
|
|
142
|
+
- [xParse overview](https://docs.textin.com/pipeline/overview)
|
|
143
|
+
- [Pipeline API](https://docs.textin.com/api-reference/endpoint/pipeline)
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# langchain-xparse
|
|
2
|
+
|
|
3
|
+
LangChain integration with [xParse Pipeline API](https://docs.textin.com/pipeline/overview) for document parsing, chunking and embedding. Supports parse / chunk / embed stages only (extract is not supported in this loader).
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
From PyPI:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install langchain-xparse
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Local editable install:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install -e .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Configuration
|
|
20
|
+
|
|
21
|
+
Set your TextIn credentials (from the [TextIn Workspace](https://www.textin.com/console/dashboard/setting)):
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
export XPARSE_APP_ID="your-app-id"
|
|
25
|
+
export XPARSE_SECRET_CODE="your-secret-code"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or pass them when creating the loader:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
loader = XParseLoader(
|
|
32
|
+
file_path="doc.pdf",
|
|
33
|
+
app_id="your-app-id",
|
|
34
|
+
secret_code="your-secret-code",
|
|
35
|
+
)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
### Basic (parse only)
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from langchain_xparse import XParseLoader
|
|
44
|
+
|
|
45
|
+
loader = XParseLoader(file_path="example.pdf")
|
|
46
|
+
docs = loader.load()
|
|
47
|
+
print(docs[0].page_content[:200])
|
|
48
|
+
print(docs[0].metadata) # source, category, element_id, filename, page_number, ...
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Lazy load
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
for doc in loader.lazy_load():
|
|
55
|
+
# process(doc)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Async
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
async for doc in loader.alazy_load():
|
|
62
|
+
# process(doc)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Convenience params (parse + chunk, or parse + chunk + embed)
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
loader = XParseLoader(
|
|
69
|
+
file_path="doc.pdf",
|
|
70
|
+
parse_provider="textin",
|
|
71
|
+
chunk_strategy="by_title",
|
|
72
|
+
chunk_max_characters=500,
|
|
73
|
+
chunk_overlap=50,
|
|
74
|
+
)
|
|
75
|
+
# Or with embed:
|
|
76
|
+
loader = XParseLoader(
|
|
77
|
+
file_path="doc.pdf",
|
|
78
|
+
parse_provider="textin",
|
|
79
|
+
chunk_strategy="basic",
|
|
80
|
+
chunk_max_characters=1000,
|
|
81
|
+
embed_provider="qwen",
|
|
82
|
+
embed_model_name="text-embedding-v4",
|
|
83
|
+
)
|
|
84
|
+
docs = loader.load()
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Custom stages (advanced)
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
loader = XParseLoader(
|
|
91
|
+
file_path="doc.pdf",
|
|
92
|
+
stages=[
|
|
93
|
+
{"type": "parse", "config": {"provider": "textin"}},
|
|
94
|
+
{"type": "chunk", "config": {"strategy": "by_page", "max_characters": 800}},
|
|
95
|
+
],
|
|
96
|
+
)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Multiple files
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
loader = XParseLoader(file_path=["a.pdf", "b.pdf"])
|
|
103
|
+
for doc in loader.lazy_load():
|
|
104
|
+
print(doc.metadata.get("source"), doc.page_content[:50])
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### File-like object
|
|
108
|
+
|
|
109
|
+
When passing a file-like object instead of a path, you must set `metadata_filename`:
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
with open("doc.pdf", "rb") as f:
|
|
113
|
+
loader = XParseLoader(file=f, metadata_filename="doc.pdf")
|
|
114
|
+
docs = loader.load()
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## References
|
|
118
|
+
|
|
119
|
+
- [xParse overview](https://docs.textin.com/pipeline/overview)
|
|
120
|
+
- [Pipeline API](https://docs.textin.com/api-reference/endpoint/pipeline)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""LangChain integration with xParse Pipeline API."""
|
|
2
|
+
|
|
3
|
+
from importlib import metadata
|
|
4
|
+
|
|
5
|
+
from langchain_xparse.client import XParseAPIError
|
|
6
|
+
from langchain_xparse.document_loaders import XParseLoader
|
|
7
|
+
|
|
8
|
+
# Resolve the installed distribution version. The published project name is
# tried first, then the import package name; when neither distribution is
# installed the version falls back to an empty string.
__version__ = ""
for _dist_name in ("langchain-xparse", __package__ or "langchain_xparse"):
    try:
        __version__ = metadata.version(_dist_name)
    except metadata.PackageNotFoundError:
        continue
    break
del _dist_name  # keep the module namespace free of the loop variable
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"XParseAPIError",
|
|
18
|
+
"XParseLoader",
|
|
19
|
+
"__version__",
|
|
20
|
+
]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""xParse Pipeline API client: auth, request/response handling, sync and async."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
DEFAULT_BASE_URL = "https://api.textin.com"
|
|
11
|
+
PIPELINE_PATH = "/api/xparse/pipeline"
|
|
12
|
+
DEFAULT_STAGES = [{"type": "parse", "config": {"provider": "textin"}}]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class XParseAPIError(Exception):
    """Error raised when the Pipeline API reports a code other than 200.

    Attributes:
        code: Numeric code associated with the failure.
        message: Human-readable description of the failure.

    ``code`` and ``message`` are also the first two entries of ``args``.
    """

    def __init__(self, code: int, message: str, *args: Any, **kwargs: Any) -> None:
        # Extra positional/keyword arguments are handed to Exception unchanged.
        super().__init__(code, message, *args, **kwargs)
        self.message = message
        self.code = code
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PipelineClient:
    """Client for the xParse Pipeline API, usable synchronously or asynchronously.

    Each call uploads one file through httpx's ``files=`` argument together
    with a JSON-encoded ``stages`` form field, then returns the list of
    elements found in the response body.

    Args:
        app_id: Value sent in the ``x-ti-app-id`` request header.
        secret_code: Value sent in the ``x-ti-secret-code`` request header.
        base_url: API root; trailing slashes are stripped before the pipeline
            path is appended.
        timeout: Timeout in seconds handed to the httpx client created for
            each request. Defaults to 120 seconds.
    """

    def __init__(
        self,
        app_id: str,
        secret_code: str,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 120.0,
    ) -> None:
        self.app_id = app_id
        self.secret_code = secret_code
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._headers = {
            "x-ti-app-id": app_id,
            "x-ti-secret-code": secret_code,
        }

    def _url(self) -> str:
        """Return the full URL of the pipeline endpoint."""
        return f"{self.base_url}{PIPELINE_PATH}"

    def _request_kwargs(
        self,
        file_content: bytes,
        filename: str,
        stages: list[dict[str, Any]] | None,
    ) -> dict[str, Any]:
        """Build the keyword arguments shared by the sync and async POST calls."""
        return {
            "headers": self._headers,
            "files": {"file": (filename, file_content)},
            # An empty or missing stage list falls back to the default pipeline.
            "data": {"stages": json.dumps(stages or DEFAULT_STAGES)},
        }

    def _parse_response(self, response: httpx.Response) -> list[dict[str, Any]]:
        """Extract the element list from a pipeline response.

        Args:
            response: Response whose body is expected to be a JSON object.

        Returns:
            The ``elements`` value found at the top level of the body or
            inside its ``data`` object; an empty list when it is absent or
            empty.

        Raises:
            XParseAPIError: If the body is not valid JSON, is not a JSON
                object, or carries a ``code`` other than 200.
        """
        try:
            data = response.json()
        except ValueError as e:  # JSON and Unicode decode errors are ValueErrors
            raise XParseAPIError(
                response.status_code,
                f"Invalid JSON response: {e}",
            ) from e
        if not isinstance(data, dict):
            # A list/string/null payload has no ``code`` or ``elements`` to read.
            raise XParseAPIError(
                response.status_code,
                f"Unexpected JSON response type: {type(data).__name__}",
            )
        # When the body carries no ``code``, the HTTP status stands in for it.
        code = data.get("code", response.status_code)
        msg = data.get("message", "")
        if code != 200:
            raise XParseAPIError(code, msg or f"HTTP {response.status_code}")
        # API may return elements at top level or nested in 'data' field
        elements = data.get("elements")
        if elements is None and isinstance(data.get("data"), dict):
            elements = data["data"].get("elements")
        return elements or []

    def run_pipeline(
        self,
        file_content: bytes,
        filename: str,
        stages: list[dict[str, Any]] | None = None,
    ) -> list[dict[str, Any]]:
        """Execute the pipeline (sync). Returns list of elements.

        Args:
            file_content: Raw bytes of the file to upload.
            filename: Name reported for the uploaded file.
            stages: Pipeline stage definitions; the default parse-only
                pipeline is used when omitted or empty.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
            XParseAPIError: If the body cannot be interpreted or reports a
                code other than 200.
        """
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(
                self._url(),
                **self._request_kwargs(file_content, filename, stages),
            )
            resp.raise_for_status()
            return self._parse_response(resp)

    async def arun_pipeline(
        self,
        file_content: bytes,
        filename: str,
        stages: list[dict[str, Any]] | None = None,
    ) -> list[dict[str, Any]]:
        """Execute the pipeline (async). Returns list of elements.

        Accepts the same arguments and raises the same exceptions as
        ``run_pipeline``.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                self._url(),
                **self._request_kwargs(file_content, filename, stages),
            )
            resp.raise_for_status()
            return self._parse_response(resp)
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""xParse Pipeline document loader: XParseLoader and _SingleDocumentLoader."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, AsyncIterator, Callable, Iterator
|
|
8
|
+
|
|
9
|
+
from langchain_core.document_loaders.base import BaseLoader
|
|
10
|
+
from langchain_core.documents import Document
|
|
11
|
+
|
|
12
|
+
from langchain_xparse.client import DEFAULT_STAGES, PipelineClient
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _build_stages(
|
|
16
|
+
*,
|
|
17
|
+
parse_provider: str = "textin",
|
|
18
|
+
chunk_strategy: str | None = None,
|
|
19
|
+
chunk_max_characters: int | None = None,
|
|
20
|
+
chunk_overlap: int | None = None,
|
|
21
|
+
chunk_include_orig_elements: bool = False,
|
|
22
|
+
chunk_new_after_n_chars: int | None = None,
|
|
23
|
+
embed_provider: str | None = None,
|
|
24
|
+
embed_model_name: str | None = None,
|
|
25
|
+
**kwargs: Any,
|
|
26
|
+
) -> list[dict[str, Any]]:
|
|
27
|
+
"""Build Pipeline stages from convenience parameters (parse/chunk/embed only)."""
|
|
28
|
+
stages: list[dict[str, Any]] = [
|
|
29
|
+
{"type": "parse", "config": {"provider": parse_provider}}
|
|
30
|
+
]
|
|
31
|
+
if chunk_strategy is not None:
|
|
32
|
+
chunk_config: dict[str, Any] = {"strategy": chunk_strategy}
|
|
33
|
+
if chunk_max_characters is not None:
|
|
34
|
+
chunk_config["max_characters"] = chunk_max_characters
|
|
35
|
+
if chunk_overlap is not None:
|
|
36
|
+
chunk_config["overlap"] = chunk_overlap
|
|
37
|
+
if chunk_include_orig_elements:
|
|
38
|
+
chunk_config["include_orig_elements"] = True
|
|
39
|
+
if chunk_new_after_n_chars is not None:
|
|
40
|
+
chunk_config["new_after_n_chars"] = chunk_new_after_n_chars
|
|
41
|
+
stages.append({"type": "chunk", "config": chunk_config})
|
|
42
|
+
if embed_provider is not None and embed_model_name is not None:
|
|
43
|
+
stages.append(
|
|
44
|
+
{
|
|
45
|
+
"type": "embed",
|
|
46
|
+
"config": {
|
|
47
|
+
"provider": embed_provider,
|
|
48
|
+
"model_name": embed_model_name,
|
|
49
|
+
},
|
|
50
|
+
}
|
|
51
|
+
)
|
|
52
|
+
return stages
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class _SingleDocumentLoader(BaseLoader):
    """Run one file through the xParse Pipeline API and emit LangChain Documents.

    The input is either a path (``file_path``) or an object exposing ``read()``
    (``file``). Every element returned by the client becomes one ``Document``.
    """

    def __init__(
        self,
        *,
        client: PipelineClient,
        file_path: str | Path | None = None,
        file: Any = None,
        stages: list[dict[str, Any]],
        post_processors: list[Callable[[str], str]] | None = None,
        metadata_filename: str | None = None,
    ) -> None:
        # Path objects are stored as plain strings; other values are kept as given.
        if isinstance(file_path, Path):
            file_path = str(file_path)
        self.file_path = file_path
        self.file = file
        self.client = client
        self.stages = stages
        self.metadata_filename = metadata_filename
        self.post_processors = post_processors or []

    def _file_content(self) -> bytes:
        """Return the data to upload, preferring the file object over the path."""
        if self.file is not None:
            return self.file.read()
        if not self.file_path:
            raise ValueError("file or file_path must be defined.")
        with open(self.file_path, "rb") as handle:
            return handle.read()

    def _filename(self) -> str:
        """Return the name sent with the upload: path basename, explicit name, or "unknown"."""
        if self.file_path:
            return Path(self.file_path).name
        return self.metadata_filename or "unknown"

    def _source(self) -> str:
        """Return the value stored under the ``source`` metadata key."""
        for candidate in (self.file_path, self.metadata_filename):
            if candidate:
                return candidate
        return ""

    def _element_to_document(self, element: dict[str, Any]) -> Document:
        """Convert one pipeline element into a ``Document``.

        The element text (empty string when missing) is passed through every
        post-processor in order. Metadata starts with ``source``, ``category``
        and ``element_id``; the element's own ``metadata`` mapping is merged on
        top, and ``embeddings`` is copied over when present.
        """
        content = element.get("text") or ""
        for transform in self.post_processors:
            content = transform(content)

        metadata: dict[str, Any] = dict(
            source=self._source(),
            category=element.get("type"),
            element_id=element.get("element_id"),
        )
        extra = element.get("metadata")
        if extra:
            # Keys supplied by the element take precedence over the defaults above.
            metadata.update(extra)
        vector = element.get("embeddings")
        if vector is not None:
            metadata["embeddings"] = vector
        return Document(page_content=content, metadata=metadata)

    def lazy_load(self) -> Iterator[Document]:
        """Read the input, run the pipeline synchronously, and yield one Document per element."""
        payload = self._file_content()
        results = self.client.run_pipeline(payload, self._filename(), self.stages)
        for item in results:
            yield self._element_to_document(item)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async counterpart of ``lazy_load``; only the API call itself is awaited."""
        payload = self._file_content()
        results = await self.client.arun_pipeline(
            payload, self._filename(), self.stages
        )
        for item in results:
            yield self._element_to_document(item)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class XParseLoader(BaseLoader):
    """Document loader backed by the xParse Pipeline API (parse/chunk/embed; no extract).

    Setup:
        Provide credentials through environment variables or constructor
        arguments:

        ```bash
        export XPARSE_APP_ID="your-app-id"
        export XPARSE_SECRET_CODE="your-secret-code"
        ```

    Example:
        ```python
        from langchain_xparse import XParseLoader

        loader = XParseLoader(file_path="example.pdf")
        docs = loader.load()

        # Convenience parameters (parse + chunk):
        loader = XParseLoader(
            file_path="doc.pdf",
            parse_provider="textin",
            chunk_strategy="by_title",
            chunk_max_characters=500,
        )
        for doc in loader.lazy_load():
            print(doc.page_content[:100], doc.metadata)
        ```

    An explicit ``stages`` list takes precedence over the ``parse_*`` /
    ``chunk_*`` / ``embed_*`` convenience parameters. ``file_path`` may be a
    single path or a list of paths.
    """

    def __init__(
        self,
        file_path: str | Path | list[str] | list[Path] | None = None,
        *,
        file: Any = None,
        app_id: str | None = None,
        secret_code: str | None = None,
        base_url: str | None = None,
        stages: list[dict[str, Any]] | None = None,
        post_processors: list[Callable[[str], str]] | None = None,
        metadata_filename: str | None = None,
        parse_provider: str = "textin",
        chunk_strategy: str | None = None,
        chunk_max_characters: int | None = None,
        chunk_overlap: int | None = None,
        chunk_include_orig_elements: bool = False,
        chunk_new_after_n_chars: int | None = None,
        embed_provider: str | None = None,
        embed_model_name: str | None = None,
        **kwargs: Any,
    ) -> None:
        if file is not None:
            if file_path is not None:
                raise ValueError("file_path and file cannot be defined simultaneously.")
            if metadata_filename is None:
                raise ValueError(
                    "When using file (file-like), metadata_filename must be specified."
                )

        # Input description and per-document processing options.
        self.file_path = file_path
        self.file = file
        self.metadata_filename = metadata_filename
        self.post_processors = post_processors or []

        # Explicit arguments win over the environment; a missing value becomes "".
        self._app_id = app_id or os.getenv("XPARSE_APP_ID") or ""
        self._secret_code = secret_code or os.getenv("XPARSE_SECRET_CODE") or ""
        self._base_url = base_url or "https://api.textin.com"

        # Convenience parameters are only consulted when no stage list is given.
        self._stages = (
            stages
            if stages is not None
            else _build_stages(
                parse_provider=parse_provider,
                chunk_strategy=chunk_strategy,
                chunk_max_characters=chunk_max_characters,
                chunk_overlap=chunk_overlap,
                chunk_include_orig_elements=chunk_include_orig_elements,
                chunk_new_after_n_chars=chunk_new_after_n_chars,
                embed_provider=embed_provider,
                embed_model_name=embed_model_name,
                **kwargs,
            )
        )

        self._client = PipelineClient(
            app_id=self._app_id,
            secret_code=self._secret_code,
            base_url=self._base_url,
        )

    def _single_loader(
        self,
        f_path: str | Path | None,
        f: Any,
        meta_filename: str | None,
    ) -> _SingleDocumentLoader:
        """Create the per-file loader that shares this loader's client, stages and post-processors."""
        path_text = None if f_path is None else str(f_path)
        return _SingleDocumentLoader(
            client=self._client,
            file_path=path_text,
            file=f,
            stages=self._stages,
            post_processors=self.post_processors,
            metadata_filename=meta_filename,
        )

    def _targets(self) -> Iterator[tuple[Any, Any, str | None]]:
        """Yield ``(f_path, f, meta_filename)`` for every input to process, in order."""
        if isinstance(self.file, list):
            # Every file object in the list is labelled with the same metadata_filename.
            for handle in self.file:
                yield None, handle, self.metadata_filename
        elif isinstance(self.file_path, list):
            for path in self.file_path:
                yield path, None, None
        else:
            yield self.file_path, self.file, self.metadata_filename

    def _load_one(
        self,
        f_path: str | Path | None = None,
        f: Any = None,
        meta_filename: str | None = None,
    ) -> Iterator[Document]:
        """Yield the Documents produced for a single path or file object (sync)."""
        yield from self._single_loader(f_path, f, meta_filename).lazy_load()

    async def _aload_one(
        self,
        f_path: str | Path | None = None,
        f: Any = None,
        meta_filename: str | None = None,
    ) -> AsyncIterator[Document]:
        """Yield the Documents produced for a single path or file object (async)."""
        async for document in self._single_loader(
            f_path, f, meta_filename
        ).alazy_load():
            yield document

    def lazy_load(self) -> Iterator[Document]:
        """Yield Documents for every configured input, one input after another."""
        for f_path, f, meta_filename in self._targets():
            yield from self._load_one(f_path=f_path, f=f, meta_filename=meta_filename)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async variant of ``lazy_load``; inputs are still processed sequentially."""
        for f_path, f, meta_filename in self._targets():
            async for document in self._aload_one(
                f_path=f_path, f=f, meta_filename=meta_filename
            ):
                yield document
|
|
File without changes
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: langchain-xparse
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: LangChain integration with xParse Pipeline API for document parsing, chunking and embedding
|
|
5
|
+
Author-email: intsig-textin <wangxuetongxztx@163.com>
|
|
6
|
+
Maintainer-email: intsig-textin <wangxuetongxztx@163.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/intsig-textin/langchain-xparse
|
|
9
|
+
Project-URL: Source, https://github.com/intsig-textin/langchain-xparse
|
|
10
|
+
Project-URL: Documentation, https://docs.textin.com/pipeline/overview
|
|
11
|
+
Project-URL: Bug Tracker, https://github.com/intsig-textin/langchain-xparse/issues
|
|
12
|
+
Keywords: langchain,xparse,document-loader,textin,parsing
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: langchain-core>=1.0
|
|
17
|
+
Requires-Dist: httpx>=0.24
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
21
|
+
Requires-Dist: python-dotenv>=1.0; extra == "dev"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# langchain-xparse
|
|
25
|
+
|
|
26
|
+
LangChain integration with [xParse Pipeline API](https://docs.textin.com/pipeline/overview) for document parsing, chunking and embedding. Supports parse / chunk / embed stages only (extract is not supported in this loader).
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
From PyPI:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install langchain-xparse
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Local editable install:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install -e .
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Configuration
|
|
43
|
+
|
|
44
|
+
Set your TextIn credentials (from the [TextIn Workspace](https://www.textin.com/console/dashboard/setting)):
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
export XPARSE_APP_ID="your-app-id"
|
|
48
|
+
export XPARSE_SECRET_CODE="your-secret-code"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Or pass them when creating the loader:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
loader = XParseLoader(
|
|
55
|
+
file_path="doc.pdf",
|
|
56
|
+
app_id="your-app-id",
|
|
57
|
+
secret_code="your-secret-code",
|
|
58
|
+
)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Usage
|
|
62
|
+
|
|
63
|
+
### Basic (parse only)
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from langchain_xparse import XParseLoader
|
|
67
|
+
|
|
68
|
+
loader = XParseLoader(file_path="example.pdf")
|
|
69
|
+
docs = loader.load()
|
|
70
|
+
print(docs[0].page_content[:200])
|
|
71
|
+
print(docs[0].metadata) # source, category, element_id, filename, page_number, ...
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Lazy load
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
for doc in loader.lazy_load():
|
|
78
|
+
# process(doc)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Async
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
async for doc in loader.alazy_load():
|
|
85
|
+
# process(doc)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Convenience params (parse + chunk, or parse + chunk + embed)
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
loader = XParseLoader(
|
|
92
|
+
file_path="doc.pdf",
|
|
93
|
+
parse_provider="textin",
|
|
94
|
+
chunk_strategy="by_title",
|
|
95
|
+
chunk_max_characters=500,
|
|
96
|
+
chunk_overlap=50,
|
|
97
|
+
)
|
|
98
|
+
# Or with embed:
|
|
99
|
+
loader = XParseLoader(
|
|
100
|
+
file_path="doc.pdf",
|
|
101
|
+
parse_provider="textin",
|
|
102
|
+
chunk_strategy="basic",
|
|
103
|
+
chunk_max_characters=1000,
|
|
104
|
+
embed_provider="qwen",
|
|
105
|
+
embed_model_name="text-embedding-v4",
|
|
106
|
+
)
|
|
107
|
+
docs = loader.load()
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Custom stages (advanced)
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
loader = XParseLoader(
|
|
114
|
+
file_path="doc.pdf",
|
|
115
|
+
stages=[
|
|
116
|
+
{"type": "parse", "config": {"provider": "textin"}},
|
|
117
|
+
{"type": "chunk", "config": {"strategy": "by_page", "max_characters": 800}},
|
|
118
|
+
],
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Multiple files
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
loader = XParseLoader(file_path=["a.pdf", "b.pdf"])
|
|
126
|
+
for doc in loader.lazy_load():
|
|
127
|
+
print(doc.metadata.get("source"), doc.page_content[:50])
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### File-like object
|
|
131
|
+
|
|
132
|
+
When passing a file-like object instead of a path, you must set `metadata_filename`:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
with open("doc.pdf", "rb") as f:
|
|
136
|
+
loader = XParseLoader(file=f, metadata_filename="doc.pdf")
|
|
137
|
+
docs = loader.load()
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## References
|
|
141
|
+
|
|
142
|
+
- [xParse overview](https://docs.textin.com/pipeline/overview)
|
|
143
|
+
- [Pipeline API](https://docs.textin.com/api-reference/endpoint/pipeline)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
langchain_xparse/__init__.py
|
|
5
|
+
langchain_xparse/client.py
|
|
6
|
+
langchain_xparse/document_loaders.py
|
|
7
|
+
langchain_xparse/py.typed
|
|
8
|
+
langchain_xparse.egg-info/PKG-INFO
|
|
9
|
+
langchain_xparse.egg-info/SOURCES.txt
|
|
10
|
+
langchain_xparse.egg-info/dependency_links.txt
|
|
11
|
+
langchain_xparse.egg-info/requires.txt
|
|
12
|
+
langchain_xparse.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
langchain_xparse
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "langchain-xparse"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "LangChain integration with xParse Pipeline API for document parsing, chunking and embedding"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "intsig-textin", email = "wangxuetongxztx@163.com"}
|
|
14
|
+
]
|
|
15
|
+
maintainers = [
|
|
16
|
+
{name = "intsig-textin", email = "wangxuetongxztx@163.com"}
|
|
17
|
+
]
|
|
18
|
+
keywords = ["langchain", "xparse", "document-loader", "textin", "parsing"]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"langchain-core>=1.0",
|
|
21
|
+
"httpx>=0.24",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
dev = [
|
|
26
|
+
"pytest>=7",
|
|
27
|
+
"pytest-asyncio>=0.21",
|
|
28
|
+
"python-dotenv>=1.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/intsig-textin/langchain-xparse"
|
|
33
|
+
Source = "https://github.com/intsig-textin/langchain-xparse"
|
|
34
|
+
Documentation = "https://docs.textin.com/pipeline/overview"
|
|
35
|
+
"Bug Tracker" = "https://github.com/intsig-textin/langchain-xparse/issues"
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.packages.find]
|
|
38
|
+
where = ["."]
|
|
39
|
+
include = ["langchain_xparse*"]
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
asyncio_mode = "auto"
|
|
43
|
+
testpaths = ["tests"]
|
|
44
|
+
markers = [
|
|
45
|
+
"integration: marks tests as integration (require XPARSE_APP_ID, XPARSE_SECRET_CODE)",
|
|
46
|
+
]
|