docpipe-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpipe/__init__.py +150 -0
- docpipe/_version.py +1 -0
- docpipe/cli/__init__.py +0 -0
- docpipe/cli/main.py +308 -0
- docpipe/config/__init__.py +0 -0
- docpipe/config/loader.py +54 -0
- docpipe/config/settings.py +41 -0
- docpipe/core/__init__.py +0 -0
- docpipe/core/errors.py +41 -0
- docpipe/core/extractor.py +37 -0
- docpipe/core/parser.py +36 -0
- docpipe/core/pipeline.py +137 -0
- docpipe/core/types.py +106 -0
- docpipe/extractors/__init__.py +0 -0
- docpipe/extractors/langchain_extractor.py +164 -0
- docpipe/extractors/langextract_extractor.py +106 -0
- docpipe/ingestion/__init__.py +0 -0
- docpipe/ingestion/pipeline.py +206 -0
- docpipe/parsers/__init__.py +0 -0
- docpipe/parsers/docling_parser.py +136 -0
- docpipe/py.typed +0 -0
- docpipe/registry/__init__.py +0 -0
- docpipe/registry/registry.py +120 -0
- docpipe/server/__init__.py +0 -0
- docpipe/server/app.py +239 -0
- docpipe_sdk-0.1.0.dist-info/METADATA +170 -0
- docpipe_sdk-0.1.0.dist-info/RECORD +30 -0
- docpipe_sdk-0.1.0.dist-info/WHEEL +4 -0
- docpipe_sdk-0.1.0.dist-info/entry_points.txt +9 -0
- docpipe_sdk-0.1.0.dist-info/licenses/LICENSE +21 -0
docpipe/__init__.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""docpipe - Unified document parsing, structured extraction, and vector ingestion pipeline."""
|
|
2
|
+
|
|
3
|
+
from docpipe._version import __version__
|
|
4
|
+
from docpipe.core.errors import (
|
|
5
|
+
ConfigurationError,
|
|
6
|
+
DocpipeError,
|
|
7
|
+
ExtractionError,
|
|
8
|
+
ExtractorNotFoundError,
|
|
9
|
+
ExtractorNotInstalledError,
|
|
10
|
+
IngestionError,
|
|
11
|
+
ParseError,
|
|
12
|
+
ParserNotFoundError,
|
|
13
|
+
ParserNotInstalledError,
|
|
14
|
+
UnsupportedFormatError,
|
|
15
|
+
)
|
|
16
|
+
from docpipe.core.extractor import BaseExtractor
|
|
17
|
+
from docpipe.core.parser import BaseParser
|
|
18
|
+
from docpipe.core.pipeline import Pipeline
|
|
19
|
+
from docpipe.core.types import (
|
|
20
|
+
DocumentFormat,
|
|
21
|
+
ExtractionResult,
|
|
22
|
+
ExtractionSchema,
|
|
23
|
+
IngestionConfig,
|
|
24
|
+
IngestionResult,
|
|
25
|
+
PageContent,
|
|
26
|
+
ParsedDocument,
|
|
27
|
+
PipelineResult,
|
|
28
|
+
SourceSpan,
|
|
29
|
+
)
|
|
30
|
+
from docpipe.registry.registry import PluginRegistry
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _register_builtins() -> None:
    """Register the bundled parsers and extractors, skipping any whose
    optional dependency is not installed."""
    reg = PluginRegistry.get()

    # Parser plugins.
    try:
        from docpipe.parsers.docling_parser import DoclingParser
        reg.register_parser("docling", DoclingParser)
    except ImportError:
        pass  # docling extra not installed — parser stays unregistered

    # Extractor plugins.
    try:
        from docpipe.extractors.langextract_extractor import LangExtractExtractor
        reg.register_extractor("langextract", LangExtractExtractor)
    except ImportError:
        pass

    try:
        from docpipe.extractors.langchain_extractor import LangChainExtractor
        reg.register_extractor("langchain", LangChainExtractor)
    except ImportError:
        pass


# Populate the global registry at import time so the convenience functions
# below can resolve plugins by name immediately.
_register_builtins()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# --- Convenience functions ---
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def parse(source: str, *, parser: str = "docling", **kwargs: object) -> ParsedDocument:
    """Parse *source* (a file path or URL) with the named parser plugin.

    Extra keyword arguments are passed through to the registry when
    resolving the parser.
    """
    registry = PluginRegistry.get()
    return registry.get_parser(parser, **kwargs).parse(source)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def extract(
    text: str,
    schema: ExtractionSchema,
    *,
    extractor: str = "langextract",
    **kwargs: object,
) -> list[ExtractionResult]:
    """Run the named extractor over *text* and return its structured results.

    Extra keyword arguments are passed through to the registry when
    resolving the extractor.
    """
    registry = PluginRegistry.get()
    return registry.get_extractor(extractor, **kwargs).extract(text, schema)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def run(
    source: str,
    schema: ExtractionSchema,
    *,
    parser: str = "docling",
    extractor: str = "langextract",
    ingestion_config: IngestionConfig | None = None,
) -> PipelineResult:
    """Run the full pipeline: parse + extract, optionally ingest.

    A one-shot wrapper around :class:`Pipeline` for callers that do not
    need to reuse the pipeline object.
    """
    return Pipeline(
        parser=parser,
        extractor=extractor,
        ingestion_config=ingestion_config,
    ).run(source, schema)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def ingest(
    source: str,
    *,
    config: IngestionConfig,
    parser: str = "docling",
) -> IngestionResult:
    """Parse a document and ingest it into a vector store."""
    # Lazy import — presumably keeps optional ingestion dependencies off
    # the package import path; verify against packaging extras.
    from docpipe.ingestion.pipeline import IngestionPipeline

    parsed_doc = PluginRegistry.get().get_parser(parser).parse(source)
    return IngestionPipeline(config).ingest(parsed_doc)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Public API surface of the docpipe package: governs `from docpipe import *`
# and documents the names callers may rely on.
__all__ = [
    "__version__",
    # Core types
    "DocumentFormat",
    "ExtractionResult",
    "ExtractionSchema",
    "IngestionConfig",
    "IngestionResult",
    "PageContent",
    "ParsedDocument",
    "PipelineResult",
    "SourceSpan",
    # Protocols
    "BaseExtractor",
    "BaseParser",
    # Pipeline
    "Pipeline",
    # Registry
    "PluginRegistry",
    # Errors
    "ConfigurationError",
    "DocpipeError",
    "ExtractionError",
    "ExtractorNotFoundError",
    "ExtractorNotInstalledError",
    "IngestionError",
    "ParseError",
    "ParserNotFoundError",
    "ParserNotInstalledError",
    "UnsupportedFormatError",
    # Convenience functions
    "extract",
    "ingest",
    "parse",
    "run",
]
|
docpipe/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Single source of truth for the package version; kept in its own module
# so it can be imported without triggering the package's plugin setup.
__version__ = "0.1.0"
|
docpipe/cli/__init__.py
ADDED
|
File without changes
|
docpipe/cli/main.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""CLI interface for docpipe."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import click
|
|
11
|
+
|
|
12
|
+
from docpipe._version import __version__
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.group()
@click.version_option(__version__, prog_name="docpipe")
@click.option("--log-level", default="INFO", help="Logging level")
def cli(log_level: str) -> None:
    """docpipe - Unified document parsing, extraction, and ingestion pipeline."""
    # Unknown level names silently fall back to INFO.
    level = getattr(logging, log_level.upper(), logging.INFO)
    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@cli.command()
@click.argument("file")
@click.option("--parser", default="docling", help="Parser to use")
@click.option(
    "--format", "output_format", default="markdown",
    type=click.Choice(["markdown", "text", "json"]),
)
@click.option("--output", "-o", default=None, help="Output file (default: stdout)")
def parse(file: str, parser: str, output_format: str, output: str | None) -> None:
    """Parse a document into structured text."""
    from docpipe.registry.registry import PluginRegistry

    result = PluginRegistry.get().get_parser(parser).parse(file)

    if output_format == "json":
        content = result.model_dump_json(indent=2)
    elif output_format == "text":
        content = result.text
    else:
        # markdown, falling back to plain text when no markdown is available
        content = result.markdown or result.text

    if output is None:
        click.echo(content)
    else:
        Path(output).write_text(content)
        click.echo(f"Output written to {output}")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@cli.command()
@click.argument("text_or_file")
@click.option("--schema", "schema_file", required=True, help="Schema YAML file")
@click.option("--extractor", default="langextract", help="Extractor to use")
@click.option("--model", "model_id", required=True, help="LLM model ID")
@click.option("--output", "-o", default=None, help="Output file (default: stdout)")
def extract(
    text_or_file: str,
    schema_file: str,
    extractor: str,
    model_id: str,
    output: str | None,
) -> None:
    """Extract structured data from text or a file."""
    import yaml

    from docpipe.core.types import ExtractionSchema
    from docpipe.registry.registry import PluginRegistry

    # Build the extraction schema from YAML; the CLI --model always wins.
    with open(schema_file) as f:
        schema_data = yaml.safe_load(f)
    schema_data["model_id"] = model_id
    schema = ExtractionSchema(**schema_data)

    # The positional argument is a path when it names an existing file,
    # otherwise it is treated as literal text.
    candidate = Path(text_or_file)
    text = candidate.read_text() if candidate.exists() else text_or_file

    results = PluginRegistry.get().get_extractor(extractor).extract(text, schema)

    content = json.dumps([r.model_dump() for r in results], indent=2, default=str)
    if output is None:
        click.echo(content)
    else:
        Path(output).write_text(content)
        click.echo(f"Output written to {output}")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@cli.command("run")
@click.argument("file")
@click.option("--schema", "schema_file", required=True, help="Schema YAML file")
@click.option("--parser", default="docling", help="Parser to use")
@click.option("--extractor", default="langextract", help="Extractor to use")
@click.option("--model", "model_id", required=True, help="LLM model ID")
@click.option("--output", "-o", default=None, help="Output file (default: stdout)")
def run_pipeline(
    file: str,
    schema_file: str,
    parser: str,
    extractor: str,
    model_id: str,
    output: str | None,
) -> None:
    """Run the full pipeline: parse + extract."""
    import yaml

    from docpipe.core.pipeline import Pipeline
    from docpipe.core.types import ExtractionSchema

    # Load the YAML schema and force the CLI-provided model into it.
    with open(schema_file) as f:
        raw_schema = yaml.safe_load(f)
    raw_schema["model_id"] = model_id
    schema = ExtractionSchema(**raw_schema)

    result = Pipeline(parser=parser, extractor=extractor).run(file, schema)
    content = result.model_dump_json(indent=2)

    if output is None:
        click.echo(content)
    else:
        Path(output).write_text(content)
        click.echo(f"Output written to {output}")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@cli.command()
@click.argument("file")
@click.option("--db", required=True, help="Database connection string")
@click.option("--table", required=True, help="Target table name")
@click.option(
    "--embedding-provider", required=True,
    help="Embedding provider (openai, ollama, huggingface, google)",
)
@click.option("--embedding-model", required=True, help="Embedding model name")
@click.option("--mode", default="both", type=click.Choice(["chunks", "extractions", "both"]))
@click.option("--chunk-size", default=1000, help="Chunk size for text splitting")
@click.option("--chunk-overlap", default=200, help="Chunk overlap")
@click.option("--parser", default="docling", help="Parser to use")
def ingest(
    file: str,
    db: str,
    table: str,
    embedding_provider: str,
    embedding_model: str,
    mode: str,
    chunk_size: int,
    chunk_overlap: int,
    parser: str,
) -> None:
    """Parse a document and ingest into a vector database."""
    from docpipe.core.types import IngestionConfig
    from docpipe.ingestion.pipeline import IngestionPipeline
    from docpipe.registry.registry import PluginRegistry

    ingestion_config = IngestionConfig(
        connection_string=db,
        table_name=table,
        embedding_provider=embedding_provider,
        embedding_model=embedding_model,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        ingest_mode=mode,
    )

    # Parse first, then hand the structured document to the ingestion pipeline.
    parsed = PluginRegistry.get().get_parser(parser).parse(file)
    result = IngestionPipeline(ingestion_config).ingest(parsed)

    click.echo(f"Ingested {result.chunks_ingested} chunks into '{result.table_name}'")
    if result.table_created:
        click.echo("Table was created.")
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@cli.command()
@click.argument("query")
@click.option("--db", required=True, help="Database connection string")
@click.option("--table", required=True, help="Table name to search")
@click.option("--embedding-provider", required=True, help="Embedding provider")
@click.option("--embedding-model", required=True, help="Embedding model name")
@click.option("--top-k", default=10, help="Number of results")
def search(
    query: str,
    db: str,
    table: str,
    embedding_provider: str,
    embedding_model: str,
    top_k: int,
) -> None:
    """Search for similar documents in the vector database."""
    from docpipe.core.types import IngestionConfig
    from docpipe.ingestion.pipeline import IngestionPipeline

    cfg = IngestionConfig(
        connection_string=db,
        table_name=table,
        embedding_provider=embedding_provider,
        embedding_model=embedding_model,
    )

    hits = IngestionPipeline(cfg).search(query, top_k=top_k)

    # Print each hit: score header, truncated content, then any metadata.
    for rank, hit in enumerate(hits, 1):
        click.echo(f"\n--- Result {rank} (score: {hit['score']:.4f}) ---")
        click.echo(hit["content"][:500])
        if hit["metadata"]:
            click.echo(f"Metadata: {json.dumps(hit['metadata'], default=str)}")
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@cli.command()
@click.option("--host", default="0.0.0.0", help="Server host")
@click.option("--port", default=8000, help="Server port")
@click.option("--reload", is_flag=True, help="Enable auto-reload for development")
def serve(host: str, port: int, reload: bool) -> None:
    """Start the docpipe API server."""
    # uvicorn is an optional dependency; fail with a helpful message
    # instead of a raw ImportError traceback.
    try:
        import uvicorn
    except ImportError:
        click.echo("Server requires uvicorn. Install with: pip install docpipe[server]", err=True)
        sys.exit(1)

    click.echo(f"Starting docpipe server on {host}:{port}")
    # App is given as an import string so --reload can re-import it.
    uvicorn.run("docpipe.server.app:app", host=host, port=port, reload=reload)
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
@cli.group()
def plugins() -> None:
    """Manage plugins."""
    # Container group only; the work happens in subcommands (e.g. `plugins list`).
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
@plugins.command("list")
def plugins_list() -> None:
    """List all registered plugins with their availability status."""
    from docpipe.registry.registry import PluginRegistry

    registry = PluginRegistry.get()

    def _show(header: str, names: list[str], info_fn) -> None:
        # Shared rendering for the parser and extractor sections; "available"
        # means the plugin's underlying library is importable.
        click.echo(header)
        for name in names:
            info = info_fn(name)
            available = "available" if info.get("available") else "not installed"
            click.echo(f" - {name} ({available})")

    _show("Parsers:", registry.list_parsers(), registry.parser_info)
    _show("\nExtractors:", registry.list_extractors(), registry.extractor_info)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@cli.group()
def config() -> None:
    """Manage configuration."""
    # Container group only; subcommands (e.g. `config init`) do the work.
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
@config.command("init")
@click.option("--output", "-o", default="docpipe.yaml", help="Output file path")
def config_init(output: str) -> None:
    """Generate a template configuration file."""
    # Template mirrors the configurable fields; commented-out keys are
    # optional — presumably matching DocpipeSettings, verify when editing.
    template = """\
# docpipe configuration
# See https://github.com/thesunnysinha/docpipe for documentation

# Parser settings
default_parser: docling
parser_options: {}

# Extractor settings
default_extractor: langextract
extractor_options: {}

# Ingestion settings (provide your own DB connection)
# db_connection_string: postgresql://user:pass@host:5432/dbname
# db_table_name: docpipe_documents
# embedding_provider: openai
# embedding_model: text-embedding-3-small
# chunk_size: 1000
# chunk_overlap: 200
# ingest_mode: both

# Server settings
server_host: "0.0.0.0"
server_port: 8000

# Pipeline settings
max_concurrency: 4

# Logging
log_level: INFO
"""
    Path(output).write_text(template)
    click.echo(f"Config template written to {output}")
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
# Allow invoking this module directly (e.g. `python -m docpipe.cli.main`).
if __name__ == "__main__":
    cli()
|
|
File without changes
|
docpipe/config/loader.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""YAML and environment-based configuration loader."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
from docpipe.config.settings import DocpipeSettings
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# Config files probed in order when no explicit path is supplied:
# project-local docpipe.yaml / docpipe.yml first, then the per-user location.
DEFAULT_CONFIG_PATHS = [
    Path("docpipe.yaml"),
    Path("docpipe.yml"),
    Path.home() / ".config" / "docpipe" / "config.yaml",
]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_config(path: str | Path | None = None) -> DocpipeSettings:
    """Load configuration from YAML file, env vars, or defaults.

    Priority (highest to lowest):
    1. Environment variables (DOCPIPE_*)
    2. Explicit config file path
    3. Auto-discovered config files (docpipe.yaml in cwd, ~/.config/docpipe/)
    4. Defaults
    """
    # NOTE(review): pydantic-settings gives __init__ kwargs precedence over
    # environment variables by default, so YAML values passed here may in
    # fact override DOCPIPE_* vars — verify against the stated priority.
    overrides: dict[str, Any] = {}

    if path is None:
        # No explicit file: probe the standard locations, first hit wins.
        for candidate in DEFAULT_CONFIG_PATHS:
            if candidate.exists():
                overrides = _load_yaml(candidate)
                logger.info("Loaded config from %s", candidate)
                break
    else:
        config_path = Path(path)
        if not config_path.exists():
            # An explicitly requested file that is missing is only warned
            # about; settings fall back to env vars and defaults.
            logger.warning("Config file not found: %s", config_path)
        else:
            overrides = _load_yaml(config_path)
            logger.info("Loaded config from %s", config_path)

    return DocpipeSettings(**overrides)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _load_yaml(path: Path) -> dict[str, Any]:
    """Load and parse a YAML config file.

    Returns an empty dict when the file is empty or its top level is not a
    mapping, so callers can splat the result into settings safely.
    """
    # Read as UTF-8 explicitly; the platform-dependent default encoding
    # would mis-decode non-ASCII config values on some systems.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return data if isinstance(data, dict) else {}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Pydantic Settings for docpipe configuration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from pydantic import Field
|
|
8
|
+
from pydantic_settings import BaseSettings
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DocpipeSettings(BaseSettings):
    """Configuration loaded from env vars, YAML, or constructor."""

    # Env vars use the DOCPIPE_ prefix; nested fields use "__" as the
    # delimiter (e.g. DOCPIPE_SERVER_PORT=9000).
    model_config = {"env_prefix": "DOCPIPE_", "env_nested_delimiter": "__"}

    # Parser settings
    default_parser: str = "docling"
    parser_options: dict[str, Any] = Field(default_factory=dict)

    # Extractor settings
    default_extractor: str = "langextract"
    extractor_options: dict[str, Any] = Field(default_factory=dict)

    # Ingestion settings (None means ingestion is not configured)
    db_connection_string: str | None = None
    db_table_name: str = "docpipe_documents"
    embedding_provider: str | None = None
    embedding_model: str | None = None
    chunk_size: int = 1000
    chunk_overlap: int = 200
    ingest_mode: str = "both"

    # Server settings
    server_host: str = "0.0.0.0"
    server_port: int = 8000

    # Pipeline settings
    max_concurrency: int = 4

    # Logging
    log_level: str = "INFO"
|
docpipe/core/__init__.py
ADDED
|
File without changes
|
docpipe/core/errors.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Exception hierarchy for docpipe."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DocpipeError(Exception):
    """Common base class for every exception raised by docpipe."""


class ParserNotFoundError(DocpipeError):
    """No parser is registered under the requested name."""


class ExtractorNotFoundError(DocpipeError):
    """No extractor is registered under the requested name."""


class ParserNotInstalledError(DocpipeError):
    """The parser exists but its underlying library is not installed."""


class ExtractorNotInstalledError(DocpipeError):
    """The extractor exists but its underlying library is not installed."""


class ParseError(DocpipeError):
    """Document parsing failed."""


class ExtractionError(DocpipeError):
    """Structured extraction failed."""


class IngestionError(DocpipeError):
    """Vector-store ingestion failed."""


class ConfigurationError(DocpipeError):
    """The supplied configuration is invalid."""


class UnsupportedFormatError(DocpipeError):
    """The selected parser does not support the document's format."""
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""BaseExtractor protocol for structured data extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
from docpipe.core.types import ExtractionResult, ExtractionSchema
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@runtime_checkable
class BaseExtractor(Protocol):
    """Protocol that all structured extractors must implement.

    ``runtime_checkable`` enables ``isinstance`` checks against the protocol
    (method presence only; signatures are not verified at runtime).
    """

    # Plugin identifier used for registry lookup (e.g. "langextract").
    name: str

    def extract(
        self,
        text: str,
        schema: ExtractionSchema,
        **kwargs: object,
    ) -> list[ExtractionResult]:
        """Extract structured data from *text* according to *schema*."""
        ...

    async def aextract(
        self,
        text: str,
        schema: ExtractionSchema,
        **kwargs: object,
    ) -> list[ExtractionResult]:
        """Async variant of extract."""
        ...

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the underlying library is installed."""
        ...
|
docpipe/core/parser.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""BaseParser protocol for document parsers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
from docpipe.core.types import ParsedDocument
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@runtime_checkable
class BaseParser(Protocol):
    """Protocol that all document parsers must implement.

    ``runtime_checkable`` enables ``isinstance`` checks against the protocol
    (method presence only; signatures are not verified at runtime).
    """

    # Plugin identifier used for registry lookup (e.g. "docling").
    name: str

    def parse(self, source: str, **kwargs: object) -> ParsedDocument:
        """Parse a single document from a file path or URL."""
        ...

    async def aparse(self, source: str, **kwargs: object) -> ParsedDocument:
        """Async variant of parse."""
        ...

    def parse_batch(self, sources: list[str], **kwargs: object) -> list[ParsedDocument]:
        """Parse multiple documents."""
        ...

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the underlying library is installed."""
        ...

    @classmethod
    def supported_formats(cls) -> list[str]:
        """Return list of supported format strings."""
        ...
|