docpipe-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. docpipe_sdk-0.1.0/.github/dependabot.yml +35 -0
  2. docpipe_sdk-0.1.0/.github/workflows/ci.yml +35 -0
  3. docpipe_sdk-0.1.0/.github/workflows/publish.yml +48 -0
  4. docpipe_sdk-0.1.0/.gitignore +18 -0
  5. docpipe_sdk-0.1.0/CHANGELOG.md +27 -0
  6. docpipe_sdk-0.1.0/Dockerfile +19 -0
  7. docpipe_sdk-0.1.0/LICENSE +21 -0
  8. docpipe_sdk-0.1.0/PKG-INFO +170 -0
  9. docpipe_sdk-0.1.0/README.md +102 -0
  10. docpipe_sdk-0.1.0/docpipe.example.yaml +29 -0
  11. docpipe_sdk-0.1.0/pyproject.toml +100 -0
  12. docpipe_sdk-0.1.0/scripts/release.sh +32 -0
  13. docpipe_sdk-0.1.0/site/index.html +586 -0
  14. docpipe_sdk-0.1.0/site/vercel.json +15 -0
  15. docpipe_sdk-0.1.0/src/docpipe/__init__.py +150 -0
  16. docpipe_sdk-0.1.0/src/docpipe/_version.py +1 -0
  17. docpipe_sdk-0.1.0/src/docpipe/cli/__init__.py +0 -0
  18. docpipe_sdk-0.1.0/src/docpipe/cli/main.py +308 -0
  19. docpipe_sdk-0.1.0/src/docpipe/config/__init__.py +0 -0
  20. docpipe_sdk-0.1.0/src/docpipe/config/loader.py +54 -0
  21. docpipe_sdk-0.1.0/src/docpipe/config/settings.py +41 -0
  22. docpipe_sdk-0.1.0/src/docpipe/core/__init__.py +0 -0
  23. docpipe_sdk-0.1.0/src/docpipe/core/errors.py +41 -0
  24. docpipe_sdk-0.1.0/src/docpipe/core/extractor.py +37 -0
  25. docpipe_sdk-0.1.0/src/docpipe/core/parser.py +36 -0
  26. docpipe_sdk-0.1.0/src/docpipe/core/pipeline.py +137 -0
  27. docpipe_sdk-0.1.0/src/docpipe/core/types.py +106 -0
  28. docpipe_sdk-0.1.0/src/docpipe/extractors/__init__.py +0 -0
  29. docpipe_sdk-0.1.0/src/docpipe/extractors/langchain_extractor.py +164 -0
  30. docpipe_sdk-0.1.0/src/docpipe/extractors/langextract_extractor.py +106 -0
  31. docpipe_sdk-0.1.0/src/docpipe/ingestion/__init__.py +0 -0
  32. docpipe_sdk-0.1.0/src/docpipe/ingestion/pipeline.py +206 -0
  33. docpipe_sdk-0.1.0/src/docpipe/parsers/__init__.py +0 -0
  34. docpipe_sdk-0.1.0/src/docpipe/parsers/docling_parser.py +136 -0
  35. docpipe_sdk-0.1.0/src/docpipe/py.typed +0 -0
  36. docpipe_sdk-0.1.0/src/docpipe/registry/__init__.py +0 -0
  37. docpipe_sdk-0.1.0/src/docpipe/registry/registry.py +120 -0
  38. docpipe_sdk-0.1.0/src/docpipe/server/__init__.py +0 -0
  39. docpipe_sdk-0.1.0/src/docpipe/server/app.py +239 -0
  40. docpipe_sdk-0.1.0/tests/__init__.py +0 -0
  41. docpipe_sdk-0.1.0/tests/conftest.py +117 -0
  42. docpipe_sdk-0.1.0/tests/integration/__init__.py +0 -0
  43. docpipe_sdk-0.1.0/tests/unit/__init__.py +0 -0
  44. docpipe_sdk-0.1.0/tests/unit/test_config.py +54 -0
  45. docpipe_sdk-0.1.0/tests/unit/test_ingestion.py +77 -0
  46. docpipe_sdk-0.1.0/tests/unit/test_pipeline.py +63 -0
  47. docpipe_sdk-0.1.0/tests/unit/test_registry.py +95 -0
  48. docpipe_sdk-0.1.0/tests/unit/test_types.py +128 -0
@@ -0,0 +1,35 @@
1
+ version: 2
2
+
3
+ updates:
4
+ # Python dependencies
5
+ - package-ecosystem: pip
6
+ directory: "/"
7
+ schedule:
8
+ interval: weekly
9
+ day: monday
10
+ open-pull-requests-limit: 10
11
+ labels:
12
+ - "dependencies"
13
+ - "python"
14
+ groups:
15
+ langchain:
16
+ patterns:
17
+ - "langchain-*"
18
+ update-types:
19
+ - "minor"
20
+ - "patch"
21
+ docling:
22
+ patterns:
23
+ - "docling*"
24
+ langextract:
25
+ patterns:
26
+ - "langextract*"
27
+
28
+ # GitHub Actions
29
+ - package-ecosystem: github-actions
30
+ directory: "/"
31
+ schedule:
32
+ interval: weekly
33
+ labels:
34
+ - "dependencies"
35
+ - "ci"
@@ -0,0 +1,35 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v6
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v6
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: pip install -e ".[dev]"
26
+
27
+ - name: Lint with ruff
28
+ run: ruff check src/
29
+
30
+ - name: Run unit tests
31
+ run: pytest tests/unit/ -v --tb=short
32
+
33
+ - name: Check types with mypy
34
+ run: mypy src/docpipe/ --ignore-missing-imports
35
+ continue-on-error: true
@@ -0,0 +1,48 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ permissions:
9
+ id-token: write
10
+
11
+ jobs:
12
+ build:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v6
16
+
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v6
19
+ with:
20
+ python-version: "3.12"
21
+
22
+ - name: Install build tools
23
+ run: pip install build
24
+
25
+ - name: Build package
26
+ run: python -m build
27
+
28
+ - name: Upload artifacts
29
+ uses: actions/upload-artifact@v7
30
+ with:
31
+ name: dist
32
+ path: dist/
33
+
34
+ publish:
35
+ needs: build
36
+ runs-on: ubuntu-latest
37
+ environment: pypi
38
+ permissions:
39
+ id-token: write
40
+ steps:
41
+ - name: Download artifacts
42
+ uses: actions/download-artifact@v7
43
+ with:
44
+ name: dist
45
+ path: dist/
46
+
47
+ - name: Publish to PyPI
48
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,18 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .venv/
10
+ venv/
11
+ .env
12
+ *.so
13
+ .mypy_cache/
14
+ .pytest_cache/
15
+ .ruff_cache/
16
+ htmlcov/
17
+ .coverage
18
+ *.log
@@ -0,0 +1,27 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.0] - 2026-04-04
11
+
12
+ ### Added
13
+
14
+ - Core pipeline architecture with Protocol-based parser and extractor interfaces
15
+ - Docling parser adapter for document parsing (PDF, DOCX, images, audio, video)
16
+ - LangExtract extractor adapter for LLM-based structured extraction
17
+ - LangChain extractor adapter using `with_structured_output()`
18
+ - Ingestion pipeline with LangChain text splitters, embeddings, and PGVector
19
+ - Plugin registry with `importlib.metadata` entry-point auto-discovery
20
+ - Configuration via Pydantic Settings (env vars + YAML files)
21
+ - CLI commands: `parse`, `extract`, `run`, `ingest`, `search`, `serve`, `plugins`, `config`
22
+ - FastAPI server with REST endpoints for all pipeline operations
23
+ - Dockerfile for containerized deployment
24
+ - 34 unit tests with mock parser/extractor
25
+
26
+ [Unreleased]: https://github.com/thesunnysinha/docpipe/compare/v0.1.0...HEAD
27
+ [0.1.0]: https://github.com/thesunnysinha/docpipe/releases/tag/v0.1.0
@@ -0,0 +1,19 @@
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies for document processing
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ libgl1 \
8
+ libglib2.0-0 \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ COPY pyproject.toml README.md LICENSE ./
12
+ COPY src/ ./src/
13
+
14
+ RUN pip install --no-cache-dir ".[all,server]"
15
+
16
+ ENTRYPOINT ["docpipe"]
17
+ CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]
18
+
19
+ EXPOSE 8000
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sunny Sinha
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,170 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpipe-sdk
3
+ Version: 0.1.0
4
+ Summary: Unified document parsing, structured extraction, and vector ingestion pipeline
5
+ Project-URL: Homepage, https://docpipe.vercel.app
6
+ Project-URL: Repository, https://github.com/thesunnysinha/docpipe
7
+ Project-URL: Bug Tracker, https://github.com/thesunnysinha/docpipe/issues
8
+ Project-URL: Changelog, https://github.com/thesunnysinha/docpipe/blob/main/CHANGELOG.md
9
+ Author-email: Sunny Sinha <thesunnysinha@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: docling,document,extraction,ingestion,langchain,langextract,llm,parsing,pipeline,rag,vector
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Classifier: Topic :: Text Processing
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.10
26
+ Requires-Dist: click>=8.0
27
+ Requires-Dist: langchain-core>=0.3
28
+ Requires-Dist: langchain-text-splitters>=0.3
29
+ Requires-Dist: pydantic-settings>=2.0
30
+ Requires-Dist: pydantic>=2.0
31
+ Requires-Dist: pyyaml>=6.0
32
+ Provides-Extra: all
33
+ Requires-Dist: docling>=2.0; extra == 'all'
34
+ Requires-Dist: fastapi>=0.100; extra == 'all'
35
+ Requires-Dist: langchain-google-genai>=2.0; extra == 'all'
36
+ Requires-Dist: langchain-ollama>=0.3; extra == 'all'
37
+ Requires-Dist: langchain-openai>=0.3; extra == 'all'
38
+ Requires-Dist: langchain-postgres>=0.0.12; extra == 'all'
39
+ Requires-Dist: langextract>=0.1; extra == 'all'
40
+ Requires-Dist: python-multipart>=0.0.6; extra == 'all'
41
+ Requires-Dist: uvicorn[standard]>=0.20; extra == 'all'
42
+ Provides-Extra: dev
43
+ Requires-Dist: httpx; extra == 'dev'
44
+ Requires-Dist: mypy; extra == 'dev'
45
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
46
+ Requires-Dist: pytest-cov; extra == 'dev'
47
+ Requires-Dist: pytest>=7.0; extra == 'dev'
48
+ Requires-Dist: ruff; extra == 'dev'
49
+ Provides-Extra: docling
50
+ Requires-Dist: docling>=2.0; extra == 'docling'
51
+ Provides-Extra: google
52
+ Requires-Dist: langchain-google-genai>=2.0; extra == 'google'
53
+ Provides-Extra: huggingface
54
+ Requires-Dist: langchain-huggingface>=0.1; extra == 'huggingface'
55
+ Provides-Extra: langextract
56
+ Requires-Dist: langextract>=0.1; extra == 'langextract'
57
+ Provides-Extra: ollama
58
+ Requires-Dist: langchain-ollama>=0.3; extra == 'ollama'
59
+ Provides-Extra: openai
60
+ Requires-Dist: langchain-openai>=0.3; extra == 'openai'
61
+ Provides-Extra: pgvector
62
+ Requires-Dist: langchain-postgres>=0.0.12; extra == 'pgvector'
63
+ Provides-Extra: server
64
+ Requires-Dist: fastapi>=0.100; extra == 'server'
65
+ Requires-Dist: python-multipart>=0.0.6; extra == 'server'
66
+ Requires-Dist: uvicorn[standard]>=0.20; extra == 'server'
67
+ Description-Content-Type: text/markdown
68
+
69
+ # docpipe
70
+
71
+ Unified document parsing, structured extraction, and vector ingestion pipeline.
72
+
73
+ ## Overview
74
+
75
+ docpipe connects document parsing (Docling), LLM-based structured extraction (LangExtract + LangChain), and vector ingestion (pgvector via LangChain) into a single composable pipeline.
76
+
77
+ **Three independent pipelines, composable together:**
78
+
79
+ 1. **Parse**: Unstructured docs → parsed text/markdown (Docling)
80
+ 2. **Extract**: Text → structured entities via LLM (LangExtract or LangChain)
81
+ 3. **Ingest**: Parsed chunks → embeddings → your vector DB (LangChain + pgvector)
82
+
83
+ ## Install
84
+
85
+ ```bash
86
+ # Core only
87
+ pip install docpipe-sdk
88
+
89
+ # With all backends
90
+ pip install "docpipe-sdk[all]"
91
+
92
+ # Pick what you need
93
+ pip install "docpipe-sdk[docling]" # Document parsing
94
+ pip install "docpipe-sdk[langextract]" # Google LangExtract
95
+ pip install "docpipe-sdk[openai]" # OpenAI embeddings + LLM
96
+ pip install "docpipe-sdk[pgvector]" # PostgreSQL vector store
97
+ pip install "docpipe-sdk[server]" # FastAPI server
98
+ ```
99
+
100
+ ## Quick Start
101
+
102
+ ### Python API
103
+
104
+ ```python
105
+ import docpipe
106
+
107
+ # Parse a document
108
+ doc = docpipe.parse("invoice.pdf")
109
+ print(doc.markdown)
110
+
111
+ # Extract structured data
112
+ schema = docpipe.ExtractionSchema(
113
+ description="Extract invoice line items with amounts",
114
+ model_id="gemini-2.5-flash",
115
+ )
116
+ results = docpipe.extract(doc.text, schema)
117
+
118
+ # Full pipeline
119
+ result = docpipe.run("invoice.pdf", schema)
120
+
121
+ # Ingest into your vector DB
122
+ config = docpipe.IngestionConfig(
123
+ connection_string="postgresql://user:pass@localhost:5432/mydb",
124
+ table_name="invoices",
125
+ embedding_provider="openai",
126
+ embedding_model="text-embedding-3-small",
127
+ )
128
+ docpipe.ingest("invoice.pdf", config=config)
129
+ ```
130
+
131
+ ### CLI
132
+
133
+ ```bash
134
+ docpipe parse invoice.pdf --format markdown
135
+ docpipe extract "John Doe, age 30" --schema schema.yaml --model gemini-2.5-flash
136
+ docpipe run invoice.pdf --schema schema.yaml --model gemini-2.5-flash
137
+ docpipe ingest invoice.pdf --db "postgresql://..." --table invoices \
138
+ --embedding-provider openai --embedding-model text-embedding-3-small
139
+ docpipe search "total amount" --db "postgresql://..." --table invoices \
140
+ --embedding-provider openai --embedding-model text-embedding-3-small
141
+ docpipe serve
142
+ docpipe plugins list
143
+ ```
144
+
145
+ ### Docker
146
+
147
+ ```bash
148
+ # API server
149
+ docker run -p 8000:8000 --env-file .env docpipe
150
+
151
+ # CLI
152
+ docker run -v ./data:/data docpipe parse /data/invoice.pdf
153
+ ```
154
+
155
+ ## Plugin System
156
+
157
+ Third-party packages can register as plugins via entry points:
158
+
159
+ ```toml
160
+ # In your package's pyproject.toml
161
+ [project.entry-points."docpipe.parsers"]
162
+ my_parser = "my_package:MyParser"
163
+
164
+ [project.entry-points."docpipe.extractors"]
165
+ my_extractor = "my_package:MyExtractor"
166
+ ```
167
+
168
+ ## License
169
+
170
+ MIT
@@ -0,0 +1,102 @@
1
+ # docpipe
2
+
3
+ Unified document parsing, structured extraction, and vector ingestion pipeline.
4
+
5
+ ## Overview
6
+
7
+ docpipe connects document parsing (Docling), LLM-based structured extraction (LangExtract + LangChain), and vector ingestion (pgvector via LangChain) into a single composable pipeline.
8
+
9
+ **Three independent pipelines, composable together:**
10
+
11
+ 1. **Parse**: Unstructured docs → parsed text/markdown (Docling)
12
+ 2. **Extract**: Text → structured entities via LLM (LangExtract or LangChain)
13
+ 3. **Ingest**: Parsed chunks → embeddings → your vector DB (LangChain + pgvector)
14
+
15
+ ## Install
16
+
17
+ ```bash
18
+ # Core only
19
+ pip install docpipe
20
+
21
+ # With all backends
22
+ pip install "docpipe[all]"
23
+
24
+ # Pick what you need
25
+ pip install "docpipe[docling]" # Document parsing
26
+ pip install "docpipe[langextract]" # Google LangExtract
27
+ pip install "docpipe[openai]" # OpenAI embeddings + LLM
28
+ pip install "docpipe[pgvector]" # PostgreSQL vector store
29
+ pip install "docpipe[server]" # FastAPI server
30
+ ```
31
+
32
+ ## Quick Start
33
+
34
+ ### Python API
35
+
36
+ ```python
37
+ import docpipe
38
+
39
+ # Parse a document
40
+ doc = docpipe.parse("invoice.pdf")
41
+ print(doc.markdown)
42
+
43
+ # Extract structured data
44
+ schema = docpipe.ExtractionSchema(
45
+ description="Extract invoice line items with amounts",
46
+ model_id="gemini-2.5-flash",
47
+ )
48
+ results = docpipe.extract(doc.text, schema)
49
+
50
+ # Full pipeline
51
+ result = docpipe.run("invoice.pdf", schema)
52
+
53
+ # Ingest into your vector DB
54
+ config = docpipe.IngestionConfig(
55
+ connection_string="postgresql://user:pass@localhost:5432/mydb",
56
+ table_name="invoices",
57
+ embedding_provider="openai",
58
+ embedding_model="text-embedding-3-small",
59
+ )
60
+ docpipe.ingest("invoice.pdf", config=config)
61
+ ```
62
+
63
+ ### CLI
64
+
65
+ ```bash
66
+ docpipe parse invoice.pdf --format markdown
67
+ docpipe extract "John Doe, age 30" --schema schema.yaml --model gemini-2.5-flash
68
+ docpipe run invoice.pdf --schema schema.yaml --model gemini-2.5-flash
69
+ docpipe ingest invoice.pdf --db "postgresql://..." --table invoices \
70
+ --embedding-provider openai --embedding-model text-embedding-3-small
71
+ docpipe search "total amount" --db "postgresql://..." --table invoices \
72
+ --embedding-provider openai --embedding-model text-embedding-3-small
73
+ docpipe serve
74
+ docpipe plugins list
75
+ ```
76
+
77
+ ### Docker
78
+
79
+ ```bash
80
+ # API server
81
+ docker run -p 8000:8000 --env-file .env docpipe
82
+
83
+ # CLI
84
+ docker run -v ./data:/data docpipe parse /data/invoice.pdf
85
+ ```
86
+
87
+ ## Plugin System
88
+
89
+ Third-party packages can register as plugins via entry points:
90
+
91
+ ```toml
92
+ # In your package's pyproject.toml
93
+ [project.entry-points."docpipe.parsers"]
94
+ my_parser = "my_package:MyParser"
95
+
96
+ [project.entry-points."docpipe.extractors"]
97
+ my_extractor = "my_package:MyExtractor"
98
+ ```
99
+
100
+ ## License
101
+
102
+ MIT
@@ -0,0 +1,29 @@
1
+ # docpipe configuration
2
+ # Copy to docpipe.yaml and customize
3
+
4
+ # Parser settings
5
+ default_parser: docling
6
+ parser_options: {}
7
+
8
+ # Extractor settings
9
+ default_extractor: langextract
10
+ extractor_options: {}
11
+
12
+ # Ingestion settings (provide your own DB connection)
13
+ # db_connection_string: postgresql://user:pass@host:5432/dbname
14
+ # db_table_name: docpipe_documents
15
+ # embedding_provider: openai
16
+ # embedding_model: text-embedding-3-small
17
+ # chunk_size: 1000
18
+ # chunk_overlap: 200
19
+ # ingest_mode: both
20
+
21
+ # Server settings
22
+ server_host: "0.0.0.0"
23
+ server_port: 8000
24
+
25
+ # Pipeline settings
26
+ max_concurrency: 4
27
+
28
+ # Logging
29
+ log_level: INFO
@@ -0,0 +1,100 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.26"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "docpipe-sdk"
7
+ version = "0.1.0"
8
+ description = "Unified document parsing, structured extraction, and vector ingestion pipeline"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ license-files = ["LICENSE"]
12
+ requires-python = ">=3.10"
13
+ authors = [{ name = "Sunny Sinha", email = "thesunnysinha@gmail.com" }]
14
+ keywords = ["document", "parsing", "extraction", "llm", "pipeline", "vector", "ingestion", "rag", "docling", "langextract", "langchain"]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ "Topic :: Text Processing",
26
+ "Topic :: Software Development :: Libraries :: Python Modules",
27
+ "Typing :: Typed",
28
+ ]
29
+
30
+ dependencies = [
31
+ "pydantic>=2.0",
32
+ "pydantic-settings>=2.0",
33
+ "pyyaml>=6.0",
34
+ "click>=8.0",
35
+ "langchain-core>=0.3",
36
+ "langchain-text-splitters>=0.3",
37
+ ]
38
+
39
+ [project.optional-dependencies]
40
+ docling = ["docling>=2.0"]
41
+ langextract = ["langextract>=0.1"]
42
+ openai = ["langchain-openai>=0.3"]
43
+ google = ["langchain-google-genai>=2.0"]
44
+ ollama = ["langchain-ollama>=0.3"]
45
+ huggingface = ["langchain-huggingface>=0.1"]
46
+ pgvector = ["langchain-postgres>=0.0.12"]
47
+ server = ["fastapi>=0.100", "uvicorn[standard]>=0.20", "python-multipart>=0.0.6"]
48
+ all = ["docpipe-sdk[docling,langextract,openai,google,ollama,pgvector,server]"]
49
+ dev = [
50
+ "pytest>=7.0",
51
+ "pytest-asyncio>=0.21",
52
+ "pytest-cov",
53
+ "ruff",
54
+ "mypy",
55
+ "httpx",
56
+ ]
57
+
58
+ [project.scripts]
59
+ docpipe = "docpipe.cli.main:cli"
60
+
61
+ [project.entry-points."docpipe.parsers"]
62
+ docling = "docpipe.parsers.docling_parser:DoclingParser"
63
+
64
+ [project.entry-points."docpipe.extractors"]
65
+ langextract = "docpipe.extractors.langextract_extractor:LangExtractExtractor"
66
+ langchain = "docpipe.extractors.langchain_extractor:LangChainExtractor"
67
+
68
+ [project.urls]
69
+ Homepage = "https://docpipe.vercel.app"
70
+ Repository = "https://github.com/thesunnysinha/docpipe"
71
+ "Bug Tracker" = "https://github.com/thesunnysinha/docpipe/issues"
72
+ Changelog = "https://github.com/thesunnysinha/docpipe/blob/main/CHANGELOG.md"
73
+
74
+ [tool.hatch.build.targets.wheel]
75
+ packages = ["src/docpipe"]
76
+
77
+ [tool.ruff]
78
+ target-version = "py310"
79
+ line-length = 100
80
+ src = ["src"]
81
+
82
+ [tool.ruff.lint]
83
+ select = ["E", "F", "I", "UP", "B", "SIM"]
84
+
85
+ [tool.mypy]
86
+ python_version = "3.10"
87
+ strict = true
88
+ warn_return_any = true
89
+ warn_unused_configs = true
90
+
91
+ [tool.pytest.ini_options]
92
+ testpaths = ["tests"]
93
+ asyncio_mode = "auto"
94
+ markers = [
95
+ "requires_docling: needs docling installed",
96
+ "requires_langextract: needs langextract installed",
97
+ "requires_langchain: needs langchain provider installed",
98
+ "requires_pgvector: needs pgvector DB available",
99
+ "requires_api_key: needs LLM API key configured",
100
+ ]
@@ -0,0 +1,32 @@
1
+ #!/bin/bash
2
+ # Release script for docpipe
3
+ # Usage: ./scripts/release.sh 0.2.0
4
+
5
+ set -euo pipefail
6
+
7
+ VERSION="${1:?Usage: $0 <version>}"
8
+
9
+ echo "Releasing docpipe v${VERSION}..."
10
+
11
+ # Update version in source
12
+ sed -i.bak "s/__version__ = \".*\"/__version__ = \"${VERSION}\"/" src/docpipe/_version.py
13
+ rm -f src/docpipe/_version.py.bak
14
+
15
+ # Update version in pyproject.toml
16
+ sed -i.bak "s/^version = \".*\"/version = \"${VERSION}\"/" pyproject.toml
17
+ rm -f pyproject.toml.bak
18
+
19
+ # Stage changes
20
+ git add src/docpipe/_version.py pyproject.toml
21
+
22
+ # Commit
23
+ git commit -m "release: v${VERSION}"
24
+
25
+ # Tag
26
+ git tag -a "v${VERSION}" -m "Release v${VERSION}"
27
+
28
+ echo ""
29
+ echo "Done! To publish:"
30
+ echo " git push origin main --tags"
31
+ echo ""
32
+ echo "GitHub Actions will automatically publish to PyPI."