langchain-xparse 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 wangxuetong123456
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: langchain-xparse
3
+ Version: 1.0.0
4
+ Summary: LangChain integration with xParse Pipeline API for document parsing, chunking and embedding
5
+ Author-email: intsig-textin <wangxuetongxztx@163.com>
6
+ Maintainer-email: intsig-textin <wangxuetongxztx@163.com>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/intsig-textin/langchain-xparse
9
+ Project-URL: Source, https://github.com/intsig-textin/langchain-xparse
10
+ Project-URL: Documentation, https://docs.textin.com/pipeline/overview
11
+ Project-URL: Bug Tracker, https://github.com/intsig-textin/langchain-xparse/issues
12
+ Keywords: langchain,xparse,document-loader,textin,parsing
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: langchain-core>=1.0
17
+ Requires-Dist: httpx>=0.24
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7; extra == "dev"
20
+ Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
21
+ Requires-Dist: python-dotenv>=1.0; extra == "dev"
22
+ Dynamic: license-file
23
+
24
+ # langchain-xparse
25
+
26
+ LangChain integration with [xParse Pipeline API](https://docs.textin.com/pipeline/overview) for document parsing, chunking and embedding. Supports parse / chunk / embed stages only (extract is not supported in this loader).
27
+
28
+ ## Installation
29
+
30
+ From PyPI:
31
+
32
+ ```bash
33
+ pip install langchain-xparse
34
+ ```
35
+
36
+ Local editable install:
37
+
38
+ ```bash
39
+ pip install -e .
40
+ ```
41
+
42
+ ## Configuration
43
+
44
+ Set your TextIn credentials (from the [TextIn Workspace](https://www.textin.com/console/dashboard/setting)):
45
+
46
+ ```bash
47
+ export XPARSE_APP_ID="your-app-id"
48
+ export XPARSE_SECRET_CODE="your-secret-code"
49
+ ```
50
+
51
+ Or pass them when creating the loader:
52
+
53
+ ```python
54
+ loader = XParseLoader(
55
+ file_path="doc.pdf",
56
+ app_id="your-app-id",
57
+ secret_code="your-secret-code",
58
+ )
59
+ ```
60
+
61
+ ## Usage
62
+
63
+ ### Basic (parse only)
64
+
65
+ ```python
66
+ from langchain_xparse import XParseLoader
67
+
68
+ loader = XParseLoader(file_path="example.pdf")
69
+ docs = loader.load()
70
+ print(docs[0].page_content[:200])
71
+ print(docs[0].metadata) # source, category, element_id, filename, page_number, ...
72
+ ```
73
+
74
+ ### Lazy load
75
+
76
+ ```python
77
+ for doc in loader.lazy_load():
78
+     process(doc)  # replace with your own handling
79
+ ```
80
+
81
+ ### Async
82
+
83
+ ```python
84
+ async for doc in loader.alazy_load():
85
+     process(doc)  # replace with your own handling
86
+ ```
87
+
88
+ ### Convenience params (parse + chunk, or parse + chunk + embed)
89
+
90
+ ```python
91
+ loader = XParseLoader(
92
+ file_path="doc.pdf",
93
+ parse_provider="textin",
94
+ chunk_strategy="by_title",
95
+ chunk_max_characters=500,
96
+ chunk_overlap=50,
97
+ )
98
+ # Or with embed:
99
+ loader = XParseLoader(
100
+ file_path="doc.pdf",
101
+ parse_provider="textin",
102
+ chunk_strategy="basic",
103
+ chunk_max_characters=1000,
104
+ embed_provider="qwen",
105
+ embed_model_name="text-embedding-v4",
106
+ )
107
+ docs = loader.load()
108
+ ```
109
+
110
+ ### Custom stages (advanced)
111
+
112
+ ```python
113
+ loader = XParseLoader(
114
+ file_path="doc.pdf",
115
+ stages=[
116
+ {"type": "parse", "config": {"provider": "textin"}},
117
+ {"type": "chunk", "config": {"strategy": "by_page", "max_characters": 800}},
118
+ ],
119
+ )
120
+ ```
121
+
122
+ ### Multiple files
123
+
124
+ ```python
125
+ loader = XParseLoader(file_path=["a.pdf", "b.pdf"])
126
+ for doc in loader.lazy_load():
127
+ print(doc.metadata.get("source"), doc.page_content[:50])
128
+ ```
129
+
130
+ ### File-like object
131
+
132
+ When passing a file-like object instead of a path, you must set `metadata_filename`:
133
+
134
+ ```python
135
+ with open("doc.pdf", "rb") as f:
136
+ loader = XParseLoader(file=f, metadata_filename="doc.pdf")
137
+ docs = loader.load()
138
+ ```
139
+
140
+ ## References
141
+
142
+ - [xParse overview](https://docs.textin.com/pipeline/overview)
143
+ - [Pipeline API](https://docs.textin.com/api-reference/endpoint/pipeline)
@@ -0,0 +1,120 @@
1
+ # langchain-xparse
2
+
3
+ LangChain integration with [xParse Pipeline API](https://docs.textin.com/pipeline/overview) for document parsing, chunking and embedding. Supports parse / chunk / embed stages only (extract is not supported in this loader).
4
+
5
+ ## Installation
6
+
7
+ From PyPI:
8
+
9
+ ```bash
10
+ pip install langchain-xparse
11
+ ```
12
+
13
+ Local editable install:
14
+
15
+ ```bash
16
+ pip install -e .
17
+ ```
18
+
19
+ ## Configuration
20
+
21
+ Set your TextIn credentials (from the [TextIn Workspace](https://www.textin.com/console/dashboard/setting)):
22
+
23
+ ```bash
24
+ export XPARSE_APP_ID="your-app-id"
25
+ export XPARSE_SECRET_CODE="your-secret-code"
26
+ ```
27
+
28
+ Or pass them when creating the loader:
29
+
30
+ ```python
31
+ loader = XParseLoader(
32
+ file_path="doc.pdf",
33
+ app_id="your-app-id",
34
+ secret_code="your-secret-code",
35
+ )
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ ### Basic (parse only)
41
+
42
+ ```python
43
+ from langchain_xparse import XParseLoader
44
+
45
+ loader = XParseLoader(file_path="example.pdf")
46
+ docs = loader.load()
47
+ print(docs[0].page_content[:200])
48
+ print(docs[0].metadata) # source, category, element_id, filename, page_number, ...
49
+ ```
50
+
51
+ ### Lazy load
52
+
53
+ ```python
54
+ for doc in loader.lazy_load():
55
+     process(doc)  # replace with your own handling
56
+ ```
57
+
58
+ ### Async
59
+
60
+ ```python
61
+ async for doc in loader.alazy_load():
62
+     process(doc)  # replace with your own handling
63
+ ```
64
+
65
+ ### Convenience params (parse + chunk, or parse + chunk + embed)
66
+
67
+ ```python
68
+ loader = XParseLoader(
69
+ file_path="doc.pdf",
70
+ parse_provider="textin",
71
+ chunk_strategy="by_title",
72
+ chunk_max_characters=500,
73
+ chunk_overlap=50,
74
+ )
75
+ # Or with embed:
76
+ loader = XParseLoader(
77
+ file_path="doc.pdf",
78
+ parse_provider="textin",
79
+ chunk_strategy="basic",
80
+ chunk_max_characters=1000,
81
+ embed_provider="qwen",
82
+ embed_model_name="text-embedding-v4",
83
+ )
84
+ docs = loader.load()
85
+ ```
86
+
87
+ ### Custom stages (advanced)
88
+
89
+ ```python
90
+ loader = XParseLoader(
91
+ file_path="doc.pdf",
92
+ stages=[
93
+ {"type": "parse", "config": {"provider": "textin"}},
94
+ {"type": "chunk", "config": {"strategy": "by_page", "max_characters": 800}},
95
+ ],
96
+ )
97
+ ```
98
+
99
+ ### Multiple files
100
+
101
+ ```python
102
+ loader = XParseLoader(file_path=["a.pdf", "b.pdf"])
103
+ for doc in loader.lazy_load():
104
+ print(doc.metadata.get("source"), doc.page_content[:50])
105
+ ```
106
+
107
+ ### File-like object
108
+
109
+ When passing a file-like object instead of a path, you must set `metadata_filename`:
110
+
111
+ ```python
112
+ with open("doc.pdf", "rb") as f:
113
+ loader = XParseLoader(file=f, metadata_filename="doc.pdf")
114
+ docs = loader.load()
115
+ ```
116
+
117
+ ## References
118
+
119
+ - [xParse overview](https://docs.textin.com/pipeline/overview)
120
+ - [Pipeline API](https://docs.textin.com/api-reference/endpoint/pipeline)
@@ -0,0 +1,20 @@
1
+ """LangChain integration with xParse Pipeline API."""
2
+
3
+ from importlib import metadata
4
+
5
+ from langchain_xparse.client import XParseAPIError
6
+ from langchain_xparse.document_loaders import XParseLoader
7
+
8
# Resolve the installed package version for ``__version__``.
try:
    # Look the distribution up under its hyphenated project name first.
    __version__ = metadata.version("langchain-xparse")
except metadata.PackageNotFoundError:
    try:
        # Retry under the import package name (``__package__`` when it is set,
        # otherwise the literal "langchain_xparse").
        __version__ = metadata.version(__package__ or "langchain_xparse")
    except metadata.PackageNotFoundError:
        # No distribution metadata was found under either name: expose an
        # empty version string instead of failing at import time.
        __version__ = ""

# Names exported by ``from langchain_xparse import *``.
__all__ = [
    "XParseAPIError",
    "XParseLoader",
    "__version__",
]
@@ -0,0 +1,92 @@
1
+ """xParse Pipeline API client: auth, request/response handling, sync and async."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ import httpx
9
+
10
# Base URL used by PipelineClient when the caller does not pass ``base_url``.
DEFAULT_BASE_URL = "https://api.textin.com"
# Endpoint path appended to the base URL to form the pipeline request URL.
PIPELINE_PATH = "/api/xparse/pipeline"
# Stages sent when the caller supplies none: a single "parse" stage using the
# "textin" provider.
DEFAULT_STAGES = [{"type": "parse", "config": {"provider": "textin"}}]
13
+
14
+
15
class XParseAPIError(Exception):
    """Raised when the Pipeline API returns code != 200.

    Attributes:
        code: Status code reported for the failure.
        message: Human-readable description of the failure.
        details: Keyword arguments supplied at construction time (an empty
            dict when none were given).

    Args:
        code: Status code to record on the exception.
        message: Error message to record on the exception.
        *args: Extra positional values appended to ``Exception.args`` after
            ``code`` and ``message``.
        **kwargs: Extra named values; stored in ``details``.
    """

    def __init__(self, code: int, message: str, *args: Any, **kwargs: Any) -> None:
        self.code = code
        self.message = message
        # Exception.__init__ rejects keyword arguments with a TypeError, so
        # they are kept on the instance instead of being forwarded.
        self.details = dict(kwargs)
        super().__init__(code, message, *args)
22
+
23
+
24
class PipelineClient:
    """Client for xParse Pipeline API (sync and async).

    Each call uploads one file plus a JSON-encoded list of stages to
    ``base_url + PIPELINE_PATH`` and returns the ``elements`` list found in
    the JSON response.

    Args:
        app_id: Value sent in the ``x-ti-app-id`` request header.
        secret_code: Value sent in the ``x-ti-secret-code`` request header.
        base_url: API root; a trailing ``/`` is stripped.
        timeout: Timeout in seconds handed to the httpx client created for
            each request.
    """

    def __init__(
        self,
        app_id: str,
        secret_code: str,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 120.0,
    ) -> None:
        self.app_id = app_id
        self.secret_code = secret_code
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._headers = {
            "x-ti-app-id": app_id,
            "x-ti-secret-code": secret_code,
        }

    def _url(self) -> str:
        """Return the full pipeline endpoint URL."""
        return f"{self.base_url}{PIPELINE_PATH}"

    def _request_kwargs(
        self,
        file_content: bytes,
        filename: str,
        stages: list[dict[str, Any]] | None,
    ) -> dict[str, Any]:
        """Build the keyword arguments shared by the sync and async POST calls."""
        return {
            "headers": self._headers,
            "files": {"file": (filename, file_content)},
            # An empty or missing stage list falls back to DEFAULT_STAGES.
            "data": {"stages": json.dumps(stages or DEFAULT_STAGES)},
        }

    def _parse_response(self, response: httpx.Response) -> list[dict[str, Any]]:
        """Extract the element list from a pipeline response.

        Raises:
            XParseAPIError: If the body is not valid JSON, is not a JSON
                object, or carries a ``code`` other than 200.
        """
        try:
            data = response.json()
        except Exception as e:
            raise XParseAPIError(
                response.status_code,
                f"Invalid JSON response: {e}",
            ) from e
        # A JSON list/string/number has no .get(); report it as an API error
        # instead of letting an AttributeError escape.
        if not isinstance(data, dict):
            raise XParseAPIError(
                response.status_code,
                "Unexpected response payload: expected a JSON object, "
                f"got {type(data).__name__}",
            )
        code = data.get("code", response.status_code)
        msg = data.get("message", "")
        if code != 200:
            raise XParseAPIError(code, msg or f"HTTP {response.status_code}")
        # API may return elements at top level or nested in 'data' field
        elements = data.get("elements")
        nested = data.get("data")
        if elements is None and isinstance(nested, dict):
            elements = nested.get("elements")
        return elements or []

    def run_pipeline(
        self,
        file_content: bytes,
        filename: str,
        stages: list[dict[str, Any]] | None = None,
    ) -> list[dict[str, Any]]:
        """Execute the pipeline (sync). Returns list of elements.

        Raises:
            httpx.HTTPStatusError: For a non-success HTTP status; this is
                raised by ``raise_for_status`` before the body is parsed.
            XParseAPIError: See ``_parse_response``.
        """
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(
                self._url(),
                **self._request_kwargs(file_content, filename, stages),
            )
            resp.raise_for_status()
            return self._parse_response(resp)

    async def arun_pipeline(
        self,
        file_content: bytes,
        filename: str,
        stages: list[dict[str, Any]] | None = None,
    ) -> list[dict[str, Any]]:
        """Execute the pipeline (async). Returns list of elements.

        Raises:
            httpx.HTTPStatusError: For a non-success HTTP status; this is
                raised by ``raise_for_status`` before the body is parsed.
            XParseAPIError: See ``_parse_response``.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                self._url(),
                **self._request_kwargs(file_content, filename, stages),
            )
            resp.raise_for_status()
            return self._parse_response(resp)
@@ -0,0 +1,274 @@
1
+ """xParse Pipeline document loader: XParseLoader and _SingleDocumentLoader."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any, AsyncIterator, Callable, Iterator
8
+
9
+ from langchain_core.document_loaders.base import BaseLoader
10
+ from langchain_core.documents import Document
11
+
12
+ from langchain_xparse.client import DEFAULT_STAGES, PipelineClient
13
+
14
+
15
+ def _build_stages(
16
+ *,
17
+ parse_provider: str = "textin",
18
+ chunk_strategy: str | None = None,
19
+ chunk_max_characters: int | None = None,
20
+ chunk_overlap: int | None = None,
21
+ chunk_include_orig_elements: bool = False,
22
+ chunk_new_after_n_chars: int | None = None,
23
+ embed_provider: str | None = None,
24
+ embed_model_name: str | None = None,
25
+ **kwargs: Any,
26
+ ) -> list[dict[str, Any]]:
27
+ """Build Pipeline stages from convenience parameters (parse/chunk/embed only)."""
28
+ stages: list[dict[str, Any]] = [
29
+ {"type": "parse", "config": {"provider": parse_provider}}
30
+ ]
31
+ if chunk_strategy is not None:
32
+ chunk_config: dict[str, Any] = {"strategy": chunk_strategy}
33
+ if chunk_max_characters is not None:
34
+ chunk_config["max_characters"] = chunk_max_characters
35
+ if chunk_overlap is not None:
36
+ chunk_config["overlap"] = chunk_overlap
37
+ if chunk_include_orig_elements:
38
+ chunk_config["include_orig_elements"] = True
39
+ if chunk_new_after_n_chars is not None:
40
+ chunk_config["new_after_n_chars"] = chunk_new_after_n_chars
41
+ stages.append({"type": "chunk", "config": chunk_config})
42
+ if embed_provider is not None and embed_model_name is not None:
43
+ stages.append(
44
+ {
45
+ "type": "embed",
46
+ "config": {
47
+ "provider": embed_provider,
48
+ "model_name": embed_model_name,
49
+ },
50
+ }
51
+ )
52
+ return stages
53
+
54
+
55
+ class _SingleDocumentLoader(BaseLoader):
56
+ """Loads a single file via xParse Pipeline API into LangChain Documents."""
57
+
58
+ def __init__(
59
+ self,
60
+ *,
61
+ client: PipelineClient,
62
+ file_path: str | Path | None = None,
63
+ file: Any = None,
64
+ stages: list[dict[str, Any]],
65
+ post_processors: list[Callable[[str], str]] | None = None,
66
+ metadata_filename: str | None = None,
67
+ ) -> None:
68
+ self.client = client
69
+ self.file_path = str(file_path) if isinstance(file_path, Path) else file_path
70
+ self.file = file
71
+ self.stages = stages
72
+ self.post_processors = post_processors or []
73
+ self.metadata_filename = metadata_filename
74
+
75
+ def _file_content(self) -> bytes:
76
+ if self.file is not None:
77
+ return self.file.read()
78
+ if self.file_path:
79
+ with open(self.file_path, "rb") as f:
80
+ return f.read()
81
+ raise ValueError("file or file_path must be defined.")
82
+
83
+ def _filename(self) -> str:
84
+ if self.file_path:
85
+ return Path(self.file_path).name
86
+ if self.metadata_filename:
87
+ return self.metadata_filename
88
+ return "unknown"
89
+
90
+ def _source(self) -> str:
91
+ return self.file_path or self.metadata_filename or ""
92
+
93
+ def _element_to_document(self, element: dict[str, Any]) -> Document:
94
+ text = element.get("text") or ""
95
+ for fn in self.post_processors:
96
+ text = fn(text)
97
+ meta: dict[str, Any] = {
98
+ "source": self._source(),
99
+ "category": element.get("type"),
100
+ "element_id": element.get("element_id"),
101
+ }
102
+ if element.get("metadata"):
103
+ meta.update(element["metadata"])
104
+ if element.get("embeddings") is not None:
105
+ meta["embeddings"] = element["embeddings"]
106
+ return Document(page_content=text, metadata=meta)
107
+
108
+ def lazy_load(self) -> Iterator[Document]:
109
+ content = self._file_content()
110
+ filename = self._filename()
111
+ elements = self.client.run_pipeline(content, filename, self.stages)
112
+ for el in elements:
113
+ yield self._element_to_document(el)
114
+
115
+ async def alazy_load(self) -> AsyncIterator[Document]:
116
+ content = self._file_content()
117
+ filename = self._filename()
118
+ elements = await self.client.arun_pipeline(content, filename, self.stages)
119
+ for el in elements:
120
+ yield self._element_to_document(el)
121
+
122
+
123
+ class XParseLoader(BaseLoader):
124
+ """Load documents via xParse Pipeline API (parse/chunk/embed; no extract).
125
+
126
+ Setup:
127
+ Set environment variables or pass credentials:
128
+
129
+ ```bash
130
+ export XPARSE_APP_ID="your-app-id"
131
+ export XPARSE_SECRET_CODE="your-secret-code"
132
+ ```
133
+
134
+ Example:
135
+ ```python
136
+ from langchain_xparse import XParseLoader
137
+
138
+ loader = XParseLoader(file_path="example.pdf")
139
+ docs = loader.load()
140
+
141
+ # With convenience params (parse + chunk):
142
+ loader = XParseLoader(
143
+ file_path="doc.pdf",
144
+ parse_provider="textin",
145
+ chunk_strategy="by_title",
146
+ chunk_max_characters=500,
147
+ )
148
+ for doc in loader.lazy_load():
149
+ print(doc.page_content[:100], doc.metadata)
150
+ ```
151
+ """
152
+
153
+ def __init__(
154
+ self,
155
+ file_path: str | Path | list[str] | list[Path] | None = None,
156
+ *,
157
+ file: Any = None,
158
+ app_id: str | None = None,
159
+ secret_code: str | None = None,
160
+ base_url: str | None = None,
161
+ stages: list[dict[str, Any]] | None = None,
162
+ post_processors: list[Callable[[str], str]] | None = None,
163
+ metadata_filename: str | None = None,
164
+ parse_provider: str = "textin",
165
+ chunk_strategy: str | None = None,
166
+ chunk_max_characters: int | None = None,
167
+ chunk_overlap: int | None = None,
168
+ chunk_include_orig_elements: bool = False,
169
+ chunk_new_after_n_chars: int | None = None,
170
+ embed_provider: str | None = None,
171
+ embed_model_name: str | None = None,
172
+ **kwargs: Any,
173
+ ) -> None:
174
+ if file_path is not None and file is not None:
175
+ raise ValueError("file_path and file cannot be defined simultaneously.")
176
+ if file is not None and metadata_filename is None:
177
+ raise ValueError(
178
+ "When using file (file-like), metadata_filename must be specified."
179
+ )
180
+
181
+ self._app_id = app_id or os.getenv("XPARSE_APP_ID") or ""
182
+ self._secret_code = secret_code or os.getenv("XPARSE_SECRET_CODE") or ""
183
+ self._base_url = base_url or "https://api.textin.com"
184
+ self.file_path = file_path
185
+ self.file = file
186
+ self.post_processors = post_processors or []
187
+ self.metadata_filename = metadata_filename
188
+
189
+ if stages is not None:
190
+ self._stages = stages
191
+ else:
192
+ self._stages = _build_stages(
193
+ parse_provider=parse_provider,
194
+ chunk_strategy=chunk_strategy,
195
+ chunk_max_characters=chunk_max_characters,
196
+ chunk_overlap=chunk_overlap,
197
+ chunk_include_orig_elements=chunk_include_orig_elements,
198
+ chunk_new_after_n_chars=chunk_new_after_n_chars,
199
+ embed_provider=embed_provider,
200
+ embed_model_name=embed_model_name,
201
+ **kwargs,
202
+ )
203
+
204
+ self._client = PipelineClient(
205
+ app_id=self._app_id,
206
+ secret_code=self._secret_code,
207
+ base_url=self._base_url,
208
+ )
209
+
210
+ def _load_one(
211
+ self,
212
+ f_path: str | Path | None = None,
213
+ f: Any = None,
214
+ meta_filename: str | None = None,
215
+ ) -> Iterator[Document]:
216
+ single = _SingleDocumentLoader(
217
+ client=self._client,
218
+ file_path=str(f_path) if f_path is not None else None,
219
+ file=f,
220
+ stages=self._stages,
221
+ post_processors=self.post_processors,
222
+ metadata_filename=meta_filename,
223
+ )
224
+ yield from single.lazy_load()
225
+
226
+ async def _aload_one(
227
+ self,
228
+ f_path: str | Path | None = None,
229
+ f: Any = None,
230
+ meta_filename: str | None = None,
231
+ ) -> AsyncIterator[Document]:
232
+ single = _SingleDocumentLoader(
233
+ client=self._client,
234
+ file_path=str(f_path) if f_path is not None else None,
235
+ file=f,
236
+ stages=self._stages,
237
+ post_processors=self.post_processors,
238
+ metadata_filename=meta_filename,
239
+ )
240
+ async for doc in single.alazy_load():
241
+ yield doc
242
+
243
+ def lazy_load(self) -> Iterator[Document]:
244
+ if isinstance(self.file, list):
245
+ for f in self.file:
246
+ yield from self._load_one(f=f, meta_filename=self.metadata_filename)
247
+ return
248
+ if isinstance(self.file_path, list):
249
+ for p in self.file_path:
250
+ yield from self._load_one(f_path=p)
251
+ return
252
+ yield from self._load_one(
253
+ f_path=self.file_path,
254
+ f=self.file,
255
+ meta_filename=self.metadata_filename,
256
+ )
257
+
258
+ async def alazy_load(self) -> AsyncIterator[Document]:
259
+ if isinstance(self.file, list):
260
+ for f in self.file:
261
+ async for doc in self._aload_one(f=f, meta_filename=self.metadata_filename):
262
+ yield doc
263
+ return
264
+ if isinstance(self.file_path, list):
265
+ for p in self.file_path:
266
+ async for doc in self._aload_one(f_path=p):
267
+ yield doc
268
+ return
269
+ async for doc in self._aload_one(
270
+ f_path=self.file_path,
271
+ f=self.file,
272
+ meta_filename=self.metadata_filename,
273
+ ):
274
+ yield doc
File without changes
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: langchain-xparse
3
+ Version: 1.0.0
4
+ Summary: LangChain integration with xParse Pipeline API for document parsing, chunking and embedding
5
+ Author-email: intsig-textin <wangxuetongxztx@163.com>
6
+ Maintainer-email: intsig-textin <wangxuetongxztx@163.com>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/intsig-textin/langchain-xparse
9
+ Project-URL: Source, https://github.com/intsig-textin/langchain-xparse
10
+ Project-URL: Documentation, https://docs.textin.com/pipeline/overview
11
+ Project-URL: Bug Tracker, https://github.com/intsig-textin/langchain-xparse/issues
12
+ Keywords: langchain,xparse,document-loader,textin,parsing
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: langchain-core>=1.0
17
+ Requires-Dist: httpx>=0.24
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7; extra == "dev"
20
+ Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
21
+ Requires-Dist: python-dotenv>=1.0; extra == "dev"
22
+ Dynamic: license-file
23
+
24
+ # langchain-xparse
25
+
26
+ LangChain integration with [xParse Pipeline API](https://docs.textin.com/pipeline/overview) for document parsing, chunking and embedding. Supports parse / chunk / embed stages only (extract is not supported in this loader).
27
+
28
+ ## Installation
29
+
30
+ From PyPI:
31
+
32
+ ```bash
33
+ pip install langchain-xparse
34
+ ```
35
+
36
+ Local editable install:
37
+
38
+ ```bash
39
+ pip install -e .
40
+ ```
41
+
42
+ ## Configuration
43
+
44
+ Set your TextIn credentials (from the [TextIn Workspace](https://www.textin.com/console/dashboard/setting)):
45
+
46
+ ```bash
47
+ export XPARSE_APP_ID="your-app-id"
48
+ export XPARSE_SECRET_CODE="your-secret-code"
49
+ ```
50
+
51
+ Or pass them when creating the loader:
52
+
53
+ ```python
54
+ loader = XParseLoader(
55
+ file_path="doc.pdf",
56
+ app_id="your-app-id",
57
+ secret_code="your-secret-code",
58
+ )
59
+ ```
60
+
61
+ ## Usage
62
+
63
+ ### Basic (parse only)
64
+
65
+ ```python
66
+ from langchain_xparse import XParseLoader
67
+
68
+ loader = XParseLoader(file_path="example.pdf")
69
+ docs = loader.load()
70
+ print(docs[0].page_content[:200])
71
+ print(docs[0].metadata) # source, category, element_id, filename, page_number, ...
72
+ ```
73
+
74
+ ### Lazy load
75
+
76
+ ```python
77
+ for doc in loader.lazy_load():
78
+     process(doc)  # replace with your own handling
79
+ ```
80
+
81
+ ### Async
82
+
83
+ ```python
84
+ async for doc in loader.alazy_load():
85
+     process(doc)  # replace with your own handling
86
+ ```
87
+
88
+ ### Convenience params (parse + chunk, or parse + chunk + embed)
89
+
90
+ ```python
91
+ loader = XParseLoader(
92
+ file_path="doc.pdf",
93
+ parse_provider="textin",
94
+ chunk_strategy="by_title",
95
+ chunk_max_characters=500,
96
+ chunk_overlap=50,
97
+ )
98
+ # Or with embed:
99
+ loader = XParseLoader(
100
+ file_path="doc.pdf",
101
+ parse_provider="textin",
102
+ chunk_strategy="basic",
103
+ chunk_max_characters=1000,
104
+ embed_provider="qwen",
105
+ embed_model_name="text-embedding-v4",
106
+ )
107
+ docs = loader.load()
108
+ ```
109
+
110
+ ### Custom stages (advanced)
111
+
112
+ ```python
113
+ loader = XParseLoader(
114
+ file_path="doc.pdf",
115
+ stages=[
116
+ {"type": "parse", "config": {"provider": "textin"}},
117
+ {"type": "chunk", "config": {"strategy": "by_page", "max_characters": 800}},
118
+ ],
119
+ )
120
+ ```
121
+
122
+ ### Multiple files
123
+
124
+ ```python
125
+ loader = XParseLoader(file_path=["a.pdf", "b.pdf"])
126
+ for doc in loader.lazy_load():
127
+ print(doc.metadata.get("source"), doc.page_content[:50])
128
+ ```
129
+
130
+ ### File-like object
131
+
132
+ When passing a file-like object instead of a path, you must set `metadata_filename`:
133
+
134
+ ```python
135
+ with open("doc.pdf", "rb") as f:
136
+ loader = XParseLoader(file=f, metadata_filename="doc.pdf")
137
+ docs = loader.load()
138
+ ```
139
+
140
+ ## References
141
+
142
+ - [xParse overview](https://docs.textin.com/pipeline/overview)
143
+ - [Pipeline API](https://docs.textin.com/api-reference/endpoint/pipeline)
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ langchain_xparse/__init__.py
5
+ langchain_xparse/client.py
6
+ langchain_xparse/document_loaders.py
7
+ langchain_xparse/py.typed
8
+ langchain_xparse.egg-info/PKG-INFO
9
+ langchain_xparse.egg-info/SOURCES.txt
10
+ langchain_xparse.egg-info/dependency_links.txt
11
+ langchain_xparse.egg-info/requires.txt
12
+ langchain_xparse.egg-info/top_level.txt
@@ -0,0 +1,7 @@
1
+ langchain-core>=1.0
2
+ httpx>=0.24
3
+
4
+ [dev]
5
+ pytest>=7
6
+ pytest-asyncio>=0.21
7
+ python-dotenv>=1.0
@@ -0,0 +1 @@
1
+ langchain_xparse
@@ -0,0 +1,46 @@
1
[build-system]
# `license = "MIT"` in [project] is an SPDX license expression (PEP 639).
# setuptools accepts that string form only from 77.0.3 on, so the floor must
# not be lower than that.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "langchain-xparse"
version = "1.0.0"
description = "LangChain integration with xParse Pipeline API for document parsing, chunking and embedding"
readme = "README.md"
license = "MIT"
requires-python = ">=3.10"
authors = [
    { name = "intsig-textin", email = "wangxuetongxztx@163.com" },
]
maintainers = [
    { name = "intsig-textin", email = "wangxuetongxztx@163.com" },
]
keywords = ["langchain", "xparse", "document-loader", "textin", "parsing"]
dependencies = [
    "langchain-core>=1.0",
    "httpx>=0.24",
]

[project.optional-dependencies]
# Test tooling; install with `pip install -e ".[dev]"`.
dev = [
    "pytest>=7",
    "pytest-asyncio>=0.21",
    "python-dotenv>=1.0",
]

[project.urls]
Homepage = "https://github.com/intsig-textin/langchain-xparse"
Source = "https://github.com/intsig-textin/langchain-xparse"
Documentation = "https://docs.textin.com/pipeline/overview"
"Bug Tracker" = "https://github.com/intsig-textin/langchain-xparse/issues"

[tool.setuptools.packages.find]
where = ["."]
include = ["langchain_xparse*"]

[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
markers = [
    "integration: marks tests as integration (require XPARSE_APP_ID, XPARSE_SECRET_CODE)",
]
@@ -0,0 +1,4 @@
1
# Options for the setuptools egg_info command: no build tag is appended to the
# version (tag_build is empty) and date tagging is off (tag_date = 0).
[egg_info]
tag_build =
tag_date = 0
4
+