docpipe-sdk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpipe_sdk-0.1.0/.github/dependabot.yml +35 -0
- docpipe_sdk-0.1.0/.github/workflows/ci.yml +35 -0
- docpipe_sdk-0.1.0/.github/workflows/publish.yml +48 -0
- docpipe_sdk-0.1.0/.gitignore +18 -0
- docpipe_sdk-0.1.0/CHANGELOG.md +27 -0
- docpipe_sdk-0.1.0/Dockerfile +19 -0
- docpipe_sdk-0.1.0/LICENSE +21 -0
- docpipe_sdk-0.1.0/PKG-INFO +170 -0
- docpipe_sdk-0.1.0/README.md +102 -0
- docpipe_sdk-0.1.0/docpipe.example.yaml +29 -0
- docpipe_sdk-0.1.0/pyproject.toml +100 -0
- docpipe_sdk-0.1.0/scripts/release.sh +32 -0
- docpipe_sdk-0.1.0/site/index.html +586 -0
- docpipe_sdk-0.1.0/site/vercel.json +15 -0
- docpipe_sdk-0.1.0/src/docpipe/__init__.py +150 -0
- docpipe_sdk-0.1.0/src/docpipe/_version.py +1 -0
- docpipe_sdk-0.1.0/src/docpipe/cli/__init__.py +0 -0
- docpipe_sdk-0.1.0/src/docpipe/cli/main.py +308 -0
- docpipe_sdk-0.1.0/src/docpipe/config/__init__.py +0 -0
- docpipe_sdk-0.1.0/src/docpipe/config/loader.py +54 -0
- docpipe_sdk-0.1.0/src/docpipe/config/settings.py +41 -0
- docpipe_sdk-0.1.0/src/docpipe/core/__init__.py +0 -0
- docpipe_sdk-0.1.0/src/docpipe/core/errors.py +41 -0
- docpipe_sdk-0.1.0/src/docpipe/core/extractor.py +37 -0
- docpipe_sdk-0.1.0/src/docpipe/core/parser.py +36 -0
- docpipe_sdk-0.1.0/src/docpipe/core/pipeline.py +137 -0
- docpipe_sdk-0.1.0/src/docpipe/core/types.py +106 -0
- docpipe_sdk-0.1.0/src/docpipe/extractors/__init__.py +0 -0
- docpipe_sdk-0.1.0/src/docpipe/extractors/langchain_extractor.py +164 -0
- docpipe_sdk-0.1.0/src/docpipe/extractors/langextract_extractor.py +106 -0
- docpipe_sdk-0.1.0/src/docpipe/ingestion/__init__.py +0 -0
- docpipe_sdk-0.1.0/src/docpipe/ingestion/pipeline.py +206 -0
- docpipe_sdk-0.1.0/src/docpipe/parsers/__init__.py +0 -0
- docpipe_sdk-0.1.0/src/docpipe/parsers/docling_parser.py +136 -0
- docpipe_sdk-0.1.0/src/docpipe/py.typed +0 -0
- docpipe_sdk-0.1.0/src/docpipe/registry/__init__.py +0 -0
- docpipe_sdk-0.1.0/src/docpipe/registry/registry.py +120 -0
- docpipe_sdk-0.1.0/src/docpipe/server/__init__.py +0 -0
- docpipe_sdk-0.1.0/src/docpipe/server/app.py +239 -0
- docpipe_sdk-0.1.0/tests/__init__.py +0 -0
- docpipe_sdk-0.1.0/tests/conftest.py +117 -0
- docpipe_sdk-0.1.0/tests/integration/__init__.py +0 -0
- docpipe_sdk-0.1.0/tests/unit/__init__.py +0 -0
- docpipe_sdk-0.1.0/tests/unit/test_config.py +54 -0
- docpipe_sdk-0.1.0/tests/unit/test_ingestion.py +77 -0
- docpipe_sdk-0.1.0/tests/unit/test_pipeline.py +63 -0
- docpipe_sdk-0.1.0/tests/unit/test_registry.py +95 -0
- docpipe_sdk-0.1.0/tests/unit/test_types.py +128 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
version: 2
|
|
2
|
+
|
|
3
|
+
updates:
|
|
4
|
+
# Python dependencies
|
|
5
|
+
- package-ecosystem: pip
|
|
6
|
+
directory: "/"
|
|
7
|
+
schedule:
|
|
8
|
+
interval: weekly
|
|
9
|
+
day: monday
|
|
10
|
+
open-pull-requests-limit: 10
|
|
11
|
+
labels:
|
|
12
|
+
- "dependencies"
|
|
13
|
+
- "python"
|
|
14
|
+
groups:
|
|
15
|
+
langchain:
|
|
16
|
+
patterns:
|
|
17
|
+
- "langchain-*"
|
|
18
|
+
update-types:
|
|
19
|
+
- "minor"
|
|
20
|
+
- "patch"
|
|
21
|
+
docling:
|
|
22
|
+
patterns:
|
|
23
|
+
- "docling*"
|
|
24
|
+
langextract:
|
|
25
|
+
patterns:
|
|
26
|
+
- "langextract*"
|
|
27
|
+
|
|
28
|
+
# GitHub Actions
|
|
29
|
+
- package-ecosystem: github-actions
|
|
30
|
+
directory: "/"
|
|
31
|
+
schedule:
|
|
32
|
+
interval: weekly
|
|
33
|
+
labels:
|
|
34
|
+
- "dependencies"
|
|
35
|
+
- "ci"
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v6
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v6
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: pip install -e ".[dev]"
|
|
26
|
+
|
|
27
|
+
- name: Lint with ruff
|
|
28
|
+
run: ruff check src/
|
|
29
|
+
|
|
30
|
+
- name: Run unit tests
|
|
31
|
+
run: pytest tests/unit/ -v --tb=short
|
|
32
|
+
|
|
33
|
+
- name: Check types with mypy
|
|
34
|
+
run: mypy src/docpipe/ --ignore-missing-imports
|
|
35
|
+
continue-on-error: true
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
build:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v6
|
|
16
|
+
|
|
17
|
+
- name: Set up Python
|
|
18
|
+
uses: actions/setup-python@v6
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
|
|
22
|
+
- name: Install build tools
|
|
23
|
+
run: pip install build
|
|
24
|
+
|
|
25
|
+
- name: Build package
|
|
26
|
+
run: python -m build
|
|
27
|
+
|
|
28
|
+
- name: Upload artifacts
|
|
29
|
+
uses: actions/upload-artifact@v7
|
|
30
|
+
with:
|
|
31
|
+
name: dist
|
|
32
|
+
path: dist/
|
|
33
|
+
|
|
34
|
+
publish:
|
|
35
|
+
needs: build
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
environment: pypi
|
|
38
|
+
permissions:
|
|
39
|
+
id-token: write
|
|
40
|
+
steps:
|
|
41
|
+
- name: Download artifacts
|
|
42
|
+
uses: actions/download-artifact@v8
|
|
43
|
+
with:
|
|
44
|
+
name: dist
|
|
45
|
+
path: dist/
|
|
46
|
+
|
|
47
|
+
- name: Publish to PyPI
|
|
48
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-04-04
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Core pipeline architecture with Protocol-based parser and extractor interfaces
|
|
15
|
+
- Docling parser adapter for document parsing (PDF, DOCX, images, audio, video)
|
|
16
|
+
- LangExtract extractor adapter for LLM-based structured extraction
|
|
17
|
+
- LangChain extractor adapter using `with_structured_output()`
|
|
18
|
+
- Ingestion pipeline with LangChain text splitters, embeddings, and PGVector
|
|
19
|
+
- Plugin registry with `importlib.metadata` entry-point auto-discovery
|
|
20
|
+
- Configuration via Pydantic Settings (env vars + YAML files)
|
|
21
|
+
- CLI commands: `parse`, `extract`, `run`, `ingest`, `search`, `serve`, `plugins`, `config`
|
|
22
|
+
- FastAPI server with REST endpoints for all pipeline operations
|
|
23
|
+
- Dockerfile for containerized deployment
|
|
24
|
+
- 34 unit tests with mock parser/extractor
|
|
25
|
+
|
|
26
|
+
[Unreleased]: https://github.com/thesunnysinha/docpipe/compare/v0.1.0...HEAD
|
|
27
|
+
[0.1.0]: https://github.com/thesunnysinha/docpipe/releases/tag/v0.1.0
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
WORKDIR /app
|
|
4
|
+
|
|
5
|
+
# Install system dependencies for document processing
|
|
6
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
7
|
+
libgl1 \
|
|
8
|
+
libglib2.0-0 \
|
|
9
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
10
|
+
|
|
11
|
+
COPY pyproject.toml README.md LICENSE ./
|
|
12
|
+
COPY src/ ./src/
|
|
13
|
+
|
|
14
|
+
RUN pip install --no-cache-dir ".[all,server]"
|
|
15
|
+
|
|
16
|
+
ENTRYPOINT ["docpipe"]
|
|
17
|
+
CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]
|
|
18
|
+
|
|
19
|
+
EXPOSE 8000
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sunny Sinha
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docpipe-sdk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified document parsing, structured extraction, and vector ingestion pipeline
|
|
5
|
+
Project-URL: Homepage, https://docpipe.vercel.app
|
|
6
|
+
Project-URL: Repository, https://github.com/thesunnysinha/docpipe
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/thesunnysinha/docpipe/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/thesunnysinha/docpipe/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Sunny Sinha <thesunnysinha@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: docling,document,extraction,ingestion,langchain,langextract,llm,parsing,pipeline,rag,vector
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Topic :: Text Processing
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.10
|
|
26
|
+
Requires-Dist: click>=8.0
|
|
27
|
+
Requires-Dist: langchain-core>=0.3
|
|
28
|
+
Requires-Dist: langchain-text-splitters>=0.3
|
|
29
|
+
Requires-Dist: pydantic-settings>=2.0
|
|
30
|
+
Requires-Dist: pydantic>=2.0
|
|
31
|
+
Requires-Dist: pyyaml>=6.0
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: docling>=2.0; extra == 'all'
|
|
34
|
+
Requires-Dist: fastapi>=0.100; extra == 'all'
|
|
35
|
+
Requires-Dist: langchain-google-genai>=2.0; extra == 'all'
|
|
36
|
+
Requires-Dist: langchain-ollama>=0.3; extra == 'all'
|
|
37
|
+
Requires-Dist: langchain-openai>=0.3; extra == 'all'
|
|
38
|
+
Requires-Dist: langchain-postgres>=0.0.12; extra == 'all'
|
|
39
|
+
Requires-Dist: langextract>=0.1; extra == 'all'
|
|
40
|
+
Requires-Dist: python-multipart>=0.0.6; extra == 'all'
|
|
41
|
+
Requires-Dist: uvicorn[standard]>=0.20; extra == 'all'
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: httpx; extra == 'dev'
|
|
44
|
+
Requires-Dist: mypy; extra == 'dev'
|
|
45
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
|
|
46
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
47
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
48
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
49
|
+
Provides-Extra: docling
|
|
50
|
+
Requires-Dist: docling>=2.0; extra == 'docling'
|
|
51
|
+
Provides-Extra: google
|
|
52
|
+
Requires-Dist: langchain-google-genai>=2.0; extra == 'google'
|
|
53
|
+
Provides-Extra: huggingface
|
|
54
|
+
Requires-Dist: langchain-huggingface>=0.1; extra == 'huggingface'
|
|
55
|
+
Provides-Extra: langextract
|
|
56
|
+
Requires-Dist: langextract>=0.1; extra == 'langextract'
|
|
57
|
+
Provides-Extra: ollama
|
|
58
|
+
Requires-Dist: langchain-ollama>=0.3; extra == 'ollama'
|
|
59
|
+
Provides-Extra: openai
|
|
60
|
+
Requires-Dist: langchain-openai>=0.3; extra == 'openai'
|
|
61
|
+
Provides-Extra: pgvector
|
|
62
|
+
Requires-Dist: langchain-postgres>=0.0.12; extra == 'pgvector'
|
|
63
|
+
Provides-Extra: server
|
|
64
|
+
Requires-Dist: fastapi>=0.100; extra == 'server'
|
|
65
|
+
Requires-Dist: python-multipart>=0.0.6; extra == 'server'
|
|
66
|
+
Requires-Dist: uvicorn[standard]>=0.20; extra == 'server'
|
|
67
|
+
Description-Content-Type: text/markdown
|
|
68
|
+
|
|
69
|
+
# docpipe
|
|
70
|
+
|
|
71
|
+
Unified document parsing, structured extraction, and vector ingestion pipeline.
|
|
72
|
+
|
|
73
|
+
## Overview
|
|
74
|
+
|
|
75
|
+
docpipe connects document parsing (Docling), LLM-based structured extraction (LangExtract + LangChain), and vector ingestion (pgvector via LangChain) into a single composable pipeline.
|
|
76
|
+
|
|
77
|
+
**Three independent pipelines, composable together:**
|
|
78
|
+
|
|
79
|
+
1. **Parse**: Unstructured docs → parsed text/markdown (Docling)
|
|
80
|
+
2. **Extract**: Text → structured entities via LLM (LangExtract or LangChain)
|
|
81
|
+
3. **Ingest**: Parsed chunks → embeddings → your vector DB (LangChain + pgvector)
|
|
82
|
+
|
|
83
|
+
## Install
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Core only
|
|
87
|
+
pip install docpipe-sdk
|
|
88
|
+
|
|
89
|
+
# With all backends
|
|
90
|
+
pip install "docpipe-sdk[all]"
|
|
91
|
+
|
|
92
|
+
# Pick what you need
|
|
93
|
+
pip install "docpipe-sdk[docling]" # Document parsing
|
|
94
|
+
pip install "docpipe-sdk[langextract]" # Google LangExtract
|
|
95
|
+
pip install "docpipe-sdk[openai]" # OpenAI embeddings + LLM
|
|
96
|
+
pip install "docpipe-sdk[pgvector]" # PostgreSQL vector store
|
|
97
|
+
pip install "docpipe-sdk[server]" # FastAPI server
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Quick Start
|
|
101
|
+
|
|
102
|
+
### Python API
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
import docpipe
|
|
106
|
+
|
|
107
|
+
# Parse a document
|
|
108
|
+
doc = docpipe.parse("invoice.pdf")
|
|
109
|
+
print(doc.markdown)
|
|
110
|
+
|
|
111
|
+
# Extract structured data
|
|
112
|
+
schema = docpipe.ExtractionSchema(
|
|
113
|
+
description="Extract invoice line items with amounts",
|
|
114
|
+
model_id="gemini-2.5-flash",
|
|
115
|
+
)
|
|
116
|
+
results = docpipe.extract(doc.text, schema)
|
|
117
|
+
|
|
118
|
+
# Full pipeline
|
|
119
|
+
result = docpipe.run("invoice.pdf", schema)
|
|
120
|
+
|
|
121
|
+
# Ingest into your vector DB
|
|
122
|
+
config = docpipe.IngestionConfig(
|
|
123
|
+
connection_string="postgresql://user:pass@localhost:5432/mydb",
|
|
124
|
+
table_name="invoices",
|
|
125
|
+
embedding_provider="openai",
|
|
126
|
+
embedding_model="text-embedding-3-small",
|
|
127
|
+
)
|
|
128
|
+
docpipe.ingest("invoice.pdf", config=config)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### CLI
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
docpipe parse invoice.pdf --format markdown
|
|
135
|
+
docpipe extract "John Doe, age 30" --schema schema.yaml --model gemini-2.5-flash
|
|
136
|
+
docpipe run invoice.pdf --schema schema.yaml --model gemini-2.5-flash
|
|
137
|
+
docpipe ingest invoice.pdf --db "postgresql://..." --table invoices \
|
|
138
|
+
--embedding-provider openai --embedding-model text-embedding-3-small
|
|
139
|
+
docpipe search "total amount" --db "postgresql://..." --table invoices \
|
|
140
|
+
--embedding-provider openai --embedding-model text-embedding-3-small
|
|
141
|
+
docpipe serve
|
|
142
|
+
docpipe plugins list
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Docker
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# API server
|
|
149
|
+
docker run -p 8000:8000 --env-file .env docpipe
|
|
150
|
+
|
|
151
|
+
# CLI
|
|
152
|
+
docker run -v ./data:/data docpipe parse /data/invoice.pdf
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Plugin System
|
|
156
|
+
|
|
157
|
+
Third-party packages can register as plugins via entry points:
|
|
158
|
+
|
|
159
|
+
```toml
|
|
160
|
+
# In your package's pyproject.toml
|
|
161
|
+
[project.entry-points."docpipe.parsers"]
|
|
162
|
+
my_parser = "my_package:MyParser"
|
|
163
|
+
|
|
164
|
+
[project.entry-points."docpipe.extractors"]
|
|
165
|
+
my_extractor = "my_package:MyExtractor"
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## License
|
|
169
|
+
|
|
170
|
+
MIT
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# docpipe
|
|
2
|
+
|
|
3
|
+
Unified document parsing, structured extraction, and vector ingestion pipeline.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
docpipe connects document parsing (Docling), LLM-based structured extraction (LangExtract + LangChain), and vector ingestion (pgvector via LangChain) into a single composable pipeline.
|
|
8
|
+
|
|
9
|
+
**Three independent pipelines, composable together:**
|
|
10
|
+
|
|
11
|
+
1. **Parse**: Unstructured docs → parsed text/markdown (Docling)
|
|
12
|
+
2. **Extract**: Text → structured entities via LLM (LangExtract or LangChain)
|
|
13
|
+
3. **Ingest**: Parsed chunks → embeddings → your vector DB (LangChain + pgvector)
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Core only
|
|
19
|
+
pip install docpipe-sdk
|
|
20
|
+
|
|
21
|
+
# With all backends
|
|
22
|
+
pip install "docpipe-sdk[all]"
|
|
23
|
+
|
|
24
|
+
# Pick what you need
|
|
25
|
+
pip install "docpipe-sdk[docling]" # Document parsing
|
|
26
|
+
pip install "docpipe-sdk[langextract]" # Google LangExtract
|
|
27
|
+
pip install "docpipe-sdk[openai]" # OpenAI embeddings + LLM
|
|
28
|
+
pip install "docpipe-sdk[pgvector]" # PostgreSQL vector store
|
|
29
|
+
pip install "docpipe-sdk[server]" # FastAPI server
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quick Start
|
|
33
|
+
|
|
34
|
+
### Python API
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
import docpipe
|
|
38
|
+
|
|
39
|
+
# Parse a document
|
|
40
|
+
doc = docpipe.parse("invoice.pdf")
|
|
41
|
+
print(doc.markdown)
|
|
42
|
+
|
|
43
|
+
# Extract structured data
|
|
44
|
+
schema = docpipe.ExtractionSchema(
|
|
45
|
+
description="Extract invoice line items with amounts",
|
|
46
|
+
model_id="gemini-2.5-flash",
|
|
47
|
+
)
|
|
48
|
+
results = docpipe.extract(doc.text, schema)
|
|
49
|
+
|
|
50
|
+
# Full pipeline
|
|
51
|
+
result = docpipe.run("invoice.pdf", schema)
|
|
52
|
+
|
|
53
|
+
# Ingest into your vector DB
|
|
54
|
+
config = docpipe.IngestionConfig(
|
|
55
|
+
connection_string="postgresql://user:pass@localhost:5432/mydb",
|
|
56
|
+
table_name="invoices",
|
|
57
|
+
embedding_provider="openai",
|
|
58
|
+
embedding_model="text-embedding-3-small",
|
|
59
|
+
)
|
|
60
|
+
docpipe.ingest("invoice.pdf", config=config)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### CLI
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
docpipe parse invoice.pdf --format markdown
|
|
67
|
+
docpipe extract "John Doe, age 30" --schema schema.yaml --model gemini-2.5-flash
|
|
68
|
+
docpipe run invoice.pdf --schema schema.yaml --model gemini-2.5-flash
|
|
69
|
+
docpipe ingest invoice.pdf --db "postgresql://..." --table invoices \
|
|
70
|
+
--embedding-provider openai --embedding-model text-embedding-3-small
|
|
71
|
+
docpipe search "total amount" --db "postgresql://..." --table invoices \
|
|
72
|
+
--embedding-provider openai --embedding-model text-embedding-3-small
|
|
73
|
+
docpipe serve
|
|
74
|
+
docpipe plugins list
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Docker
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# API server
|
|
81
|
+
docker run -p 8000:8000 --env-file .env docpipe
|
|
82
|
+
|
|
83
|
+
# CLI
|
|
84
|
+
docker run -v ./data:/data docpipe parse /data/invoice.pdf
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Plugin System
|
|
88
|
+
|
|
89
|
+
Third-party packages can register as plugins via entry points:
|
|
90
|
+
|
|
91
|
+
```toml
|
|
92
|
+
# In your package's pyproject.toml
|
|
93
|
+
[project.entry-points."docpipe.parsers"]
|
|
94
|
+
my_parser = "my_package:MyParser"
|
|
95
|
+
|
|
96
|
+
[project.entry-points."docpipe.extractors"]
|
|
97
|
+
my_extractor = "my_package:MyExtractor"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## License
|
|
101
|
+
|
|
102
|
+
MIT
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# docpipe configuration
|
|
2
|
+
# Copy to docpipe.yaml and customize
|
|
3
|
+
|
|
4
|
+
# Parser settings
|
|
5
|
+
default_parser: docling
|
|
6
|
+
parser_options: {}
|
|
7
|
+
|
|
8
|
+
# Extractor settings
|
|
9
|
+
default_extractor: langextract
|
|
10
|
+
extractor_options: {}
|
|
11
|
+
|
|
12
|
+
# Ingestion settings (provide your own DB connection)
|
|
13
|
+
# db_connection_string: postgresql://user:pass@host:5432/dbname
|
|
14
|
+
# db_table_name: docpipe_documents
|
|
15
|
+
# embedding_provider: openai
|
|
16
|
+
# embedding_model: text-embedding-3-small
|
|
17
|
+
# chunk_size: 1000
|
|
18
|
+
# chunk_overlap: 200
|
|
19
|
+
# ingest_mode: both
|
|
20
|
+
|
|
21
|
+
# Server settings
|
|
22
|
+
server_host: "0.0.0.0"
|
|
23
|
+
server_port: 8000
|
|
24
|
+
|
|
25
|
+
# Pipeline settings
|
|
26
|
+
max_concurrency: 4
|
|
27
|
+
|
|
28
|
+
# Logging
|
|
29
|
+
log_level: INFO
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.26"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "docpipe-sdk"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Unified document parsing, structured extraction, and vector ingestion pipeline"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
license-files = ["LICENSE"]
|
|
12
|
+
requires-python = ">=3.10"
|
|
13
|
+
authors = [{ name = "Sunny Sinha", email = "thesunnysinha@gmail.com" }]
|
|
14
|
+
keywords = ["document", "parsing", "extraction", "llm", "pipeline", "vector", "ingestion", "rag", "docling", "langextract", "langchain"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
"Topic :: Text Processing",
|
|
26
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
"Typing :: Typed",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
dependencies = [
|
|
31
|
+
"pydantic>=2.0",
|
|
32
|
+
"pydantic-settings>=2.0",
|
|
33
|
+
"pyyaml>=6.0",
|
|
34
|
+
"click>=8.0",
|
|
35
|
+
"langchain-core>=0.3",
|
|
36
|
+
"langchain-text-splitters>=0.3",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
docling = ["docling>=2.0"]
|
|
41
|
+
langextract = ["langextract>=0.1"]
|
|
42
|
+
openai = ["langchain-openai>=0.3"]
|
|
43
|
+
google = ["langchain-google-genai>=2.0"]
|
|
44
|
+
ollama = ["langchain-ollama>=0.3"]
|
|
45
|
+
huggingface = ["langchain-huggingface>=0.1"]
|
|
46
|
+
pgvector = ["langchain-postgres>=0.0.12"]
|
|
47
|
+
server = ["fastapi>=0.100", "uvicorn[standard]>=0.20", "python-multipart>=0.0.6"]
|
|
48
|
+
all = ["docpipe-sdk[docling,langextract,openai,google,ollama,pgvector,server]"]
|
|
49
|
+
dev = [
|
|
50
|
+
"pytest>=7.0",
|
|
51
|
+
"pytest-asyncio>=0.21",
|
|
52
|
+
"pytest-cov",
|
|
53
|
+
"ruff",
|
|
54
|
+
"mypy",
|
|
55
|
+
"httpx",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
[project.scripts]
|
|
59
|
+
docpipe = "docpipe.cli.main:cli"
|
|
60
|
+
|
|
61
|
+
[project.entry-points."docpipe.parsers"]
|
|
62
|
+
docling = "docpipe.parsers.docling_parser:DoclingParser"
|
|
63
|
+
|
|
64
|
+
[project.entry-points."docpipe.extractors"]
|
|
65
|
+
langextract = "docpipe.extractors.langextract_extractor:LangExtractExtractor"
|
|
66
|
+
langchain = "docpipe.extractors.langchain_extractor:LangChainExtractor"
|
|
67
|
+
|
|
68
|
+
[project.urls]
|
|
69
|
+
Homepage = "https://docpipe.vercel.app"
|
|
70
|
+
Repository = "https://github.com/thesunnysinha/docpipe"
|
|
71
|
+
"Bug Tracker" = "https://github.com/thesunnysinha/docpipe/issues"
|
|
72
|
+
Changelog = "https://github.com/thesunnysinha/docpipe/blob/main/CHANGELOG.md"
|
|
73
|
+
|
|
74
|
+
[tool.hatch.build.targets.wheel]
|
|
75
|
+
packages = ["src/docpipe"]
|
|
76
|
+
|
|
77
|
+
[tool.ruff]
|
|
78
|
+
target-version = "py310"
|
|
79
|
+
line-length = 100
|
|
80
|
+
src = ["src"]
|
|
81
|
+
|
|
82
|
+
[tool.ruff.lint]
|
|
83
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
84
|
+
|
|
85
|
+
[tool.mypy]
|
|
86
|
+
python_version = "3.10"
|
|
87
|
+
strict = true
|
|
88
|
+
warn_return_any = true
|
|
89
|
+
warn_unused_configs = true
|
|
90
|
+
|
|
91
|
+
[tool.pytest.ini_options]
|
|
92
|
+
testpaths = ["tests"]
|
|
93
|
+
asyncio_mode = "auto"
|
|
94
|
+
markers = [
|
|
95
|
+
"requires_docling: needs docling installed",
|
|
96
|
+
"requires_langextract: needs langextract installed",
|
|
97
|
+
"requires_langchain: needs langchain provider installed",
|
|
98
|
+
"requires_pgvector: needs pgvector DB available",
|
|
99
|
+
"requires_api_key: needs LLM API key configured",
|
|
100
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Release script for docpipe
|
|
3
|
+
# Usage: ./scripts/release.sh 0.2.0
|
|
4
|
+
|
|
5
|
+
set -euo pipefail
|
|
6
|
+
|
|
7
|
+
VERSION="${1:?Usage: $0 <version>}"
|
|
8
|
+
|
|
9
|
+
echo "Releasing docpipe v${VERSION}..."
|
|
10
|
+
|
|
11
|
+
# Update version in source
|
|
12
|
+
sed -i.bak "s/__version__ = \".*\"/__version__ = \"${VERSION}\"/" src/docpipe/_version.py
|
|
13
|
+
rm -f src/docpipe/_version.py.bak
|
|
14
|
+
|
|
15
|
+
# Update version in pyproject.toml
|
|
16
|
+
sed -i.bak "s/^version = \".*\"/version = \"${VERSION}\"/" pyproject.toml
|
|
17
|
+
rm -f pyproject.toml.bak
|
|
18
|
+
|
|
19
|
+
# Stage changes
|
|
20
|
+
git add src/docpipe/_version.py pyproject.toml
|
|
21
|
+
|
|
22
|
+
# Commit
|
|
23
|
+
git commit -m "release: v${VERSION}"
|
|
24
|
+
|
|
25
|
+
# Tag
|
|
26
|
+
git tag -a "v${VERSION}" -m "Release v${VERSION}"
|
|
27
|
+
|
|
28
|
+
echo ""
|
|
29
|
+
echo "Done! To publish:"
|
|
30
|
+
echo " git push origin main --tags"
|
|
31
|
+
echo ""
|
|
32
|
+
echo "GitHub Actions will automatically publish to PyPI."
|