docpipe-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docpipe/__init__.py ADDED
@@ -0,0 +1,150 @@
1
+ """docpipe - Unified document parsing, structured extraction, and vector ingestion pipeline."""
2
+
3
+ from docpipe._version import __version__
4
+ from docpipe.core.errors import (
5
+ ConfigurationError,
6
+ DocpipeError,
7
+ ExtractionError,
8
+ ExtractorNotFoundError,
9
+ ExtractorNotInstalledError,
10
+ IngestionError,
11
+ ParseError,
12
+ ParserNotFoundError,
13
+ ParserNotInstalledError,
14
+ UnsupportedFormatError,
15
+ )
16
+ from docpipe.core.extractor import BaseExtractor
17
+ from docpipe.core.parser import BaseParser
18
+ from docpipe.core.pipeline import Pipeline
19
+ from docpipe.core.types import (
20
+ DocumentFormat,
21
+ ExtractionResult,
22
+ ExtractionSchema,
23
+ IngestionConfig,
24
+ IngestionResult,
25
+ PageContent,
26
+ ParsedDocument,
27
+ PipelineResult,
28
+ SourceSpan,
29
+ )
30
+ from docpipe.registry.registry import PluginRegistry
31
+
32
+
33
def _register_builtins() -> None:
    """Register built-in parsers and extractors if their dependencies are available."""
    registry = PluginRegistry.get()

    # Each built-in plugin sits behind an optional dependency; when the
    # dependency is missing we simply skip registration instead of failing
    # the top-level package import.
    try:
        from docpipe.parsers.docling_parser import DoclingParser
    except ImportError:
        pass
    else:
        registry.register_parser("docling", DoclingParser)

    try:
        from docpipe.extractors.langextract_extractor import LangExtractExtractor
    except ImportError:
        pass
    else:
        registry.register_extractor("langextract", LangExtractExtractor)

    try:
        from docpipe.extractors.langchain_extractor import LangChainExtractor
    except ImportError:
        pass
    else:
        registry.register_extractor("langchain", LangChainExtractor)


_register_builtins()
60
+
61
+
62
+ # --- Convenience functions ---
63
+
64
+
65
def parse(source: str, *, parser: str = "docling", **kwargs: object) -> ParsedDocument:
    """Parse a document using the specified parser.

    Args:
        source: File path or URL of the document.
        parser: Name of the registered parser to use.
        **kwargs: Extra options forwarded to the parser lookup.
    """
    chosen = PluginRegistry.get().get_parser(parser, **kwargs)
    return chosen.parse(source)
69
+
70
+
71
def extract(
    text: str,
    schema: ExtractionSchema,
    *,
    extractor: str = "langextract",
    **kwargs: object,
) -> list[ExtractionResult]:
    """Extract structured data using the specified extractor.

    Args:
        text: Input text to extract from.
        schema: Schema describing the desired structured output.
        extractor: Name of the registered extractor to use.
        **kwargs: Extra options forwarded to the extractor lookup.
    """
    engine = PluginRegistry.get().get_extractor(extractor, **kwargs)
    return engine.extract(text, schema)
81
+
82
+
83
def run(
    source: str,
    schema: ExtractionSchema,
    *,
    parser: str = "docling",
    extractor: str = "langextract",
    ingestion_config: IngestionConfig | None = None,
) -> PipelineResult:
    """Run the full pipeline: parse + extract, optionally ingest.

    Args:
        source: File path or URL of the document.
        schema: Schema describing the desired structured output.
        parser: Name of the registered parser to use.
        extractor: Name of the registered extractor to use.
        ingestion_config: Optional vector-store configuration; when given,
            the pipeline also ingests results.
    """
    return Pipeline(
        parser=parser,
        extractor=extractor,
        ingestion_config=ingestion_config,
    ).run(source, schema)
98
+
99
+
100
def ingest(
    source: str,
    *,
    config: IngestionConfig,
    parser: str = "docling",
) -> IngestionResult:
    """Parse a document and ingest it into a vector store.

    Args:
        source: File path or URL of the document.
        config: Vector-store ingestion configuration.
        parser: Name of the registered parser to use.
    """
    # Imported lazily so the ingestion stack is only loaded when used.
    from docpipe.ingestion.pipeline import IngestionPipeline

    parsed_doc = PluginRegistry.get().get_parser(parser).parse(source)
    return IngestionPipeline(config).ingest(parsed_doc)
113
+
114
+
115
+ __all__ = [
116
+ "__version__",
117
+ # Core types
118
+ "DocumentFormat",
119
+ "ExtractionResult",
120
+ "ExtractionSchema",
121
+ "IngestionConfig",
122
+ "IngestionResult",
123
+ "PageContent",
124
+ "ParsedDocument",
125
+ "PipelineResult",
126
+ "SourceSpan",
127
+ # Protocols
128
+ "BaseExtractor",
129
+ "BaseParser",
130
+ # Pipeline
131
+ "Pipeline",
132
+ # Registry
133
+ "PluginRegistry",
134
+ # Errors
135
+ "ConfigurationError",
136
+ "DocpipeError",
137
+ "ExtractionError",
138
+ "ExtractorNotFoundError",
139
+ "ExtractorNotInstalledError",
140
+ "IngestionError",
141
+ "ParseError",
142
+ "ParserNotFoundError",
143
+ "ParserNotInstalledError",
144
+ "UnsupportedFormatError",
145
+ # Convenience functions
146
+ "extract",
147
+ "ingest",
148
+ "parse",
149
+ "run",
150
+ ]
docpipe/_version.py ADDED
@@ -0,0 +1 @@
1
# Single source of truth for the package version (exposed as docpipe.__version__).
__version__ = "0.1.0"
File without changes
docpipe/cli/main.py ADDED
@@ -0,0 +1,308 @@
1
+ """CLI interface for docpipe."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ import click
11
+
12
+ from docpipe._version import __version__
13
+
14
+
15
@click.group()
@click.version_option(__version__, prog_name="docpipe")
@click.option("--log-level", default="INFO", help="Logging level")
def cli(log_level: str) -> None:
    """docpipe - Unified document parsing, extraction, and ingestion pipeline."""
    # Unknown level names silently fall back to INFO.
    resolved_level = getattr(logging, log_level.upper(), logging.INFO)
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=resolved_level,
    )
24
+
25
+
26
@cli.command()
@click.argument("file")
@click.option("--parser", default="docling", help="Parser to use")
@click.option(
    "--format", "output_format", default="markdown",
    type=click.Choice(["markdown", "text", "json"]),
)
@click.option("--output", "-o", default=None, help="Output file (default: stdout)")
def parse(file: str, parser: str, output_format: str, output: str | None) -> None:
    """Parse a document into structured text.

    Writes markdown, plain text, or the full parsed document as JSON,
    either to stdout or to --output.
    """
    from docpipe.registry.registry import PluginRegistry

    p = PluginRegistry.get().get_parser(parser)
    result = p.parse(file)

    if output_format == "markdown":
        # Fall back to plain text when the parser produced no markdown.
        content = result.markdown or result.text
    elif output_format == "text":
        content = result.text
    else:
        content = result.model_dump_json(indent=2)

    if output:
        # Fix: write UTF-8 explicitly — the platform default encoding
        # (e.g. cp1252 on Windows) can raise on non-ASCII document content.
        Path(output).write_text(content, encoding="utf-8")
        click.echo(f"Output written to {output}")
    else:
        click.echo(content)
53
+
54
+
55
@cli.command()
@click.argument("text_or_file")
@click.option("--schema", "schema_file", required=True, help="Schema YAML file")
@click.option("--extractor", default="langextract", help="Extractor to use")
@click.option("--model", "model_id", required=True, help="LLM model ID")
@click.option("--output", "-o", default=None, help="Output file (default: stdout)")
def extract(
    text_or_file: str,
    schema_file: str,
    extractor: str,
    model_id: str,
    output: str | None,
) -> None:
    """Extract structured data from text or a file.

    TEXT_OR_FILE is treated as a file path when it names an existing
    file, otherwise as raw input text.
    """
    import yaml

    from docpipe.core.types import ExtractionSchema
    from docpipe.registry.registry import PluginRegistry

    # Load schema; the CLI --model flag overrides any model_id in the YAML.
    with open(schema_file, encoding="utf-8") as f:
        schema_data = yaml.safe_load(f)
    schema_data["model_id"] = model_id
    schema = ExtractionSchema(**schema_data)

    # Determine if input is a file or text.
    # Fix: Path.exists() raises OSError/ValueError when the argument is
    # too long or malformed to be a path (i.e. it is raw text); treat
    # that the same as "no such file" instead of crashing.
    text = text_or_file
    try:
        if Path(text_or_file).exists():
            text = Path(text_or_file).read_text(encoding="utf-8")
    except (OSError, ValueError):
        pass

    e = PluginRegistry.get().get_extractor(extractor)
    results = e.extract(text, schema)

    output_data = [r.model_dump() for r in results]
    content = json.dumps(output_data, indent=2, default=str)

    if output:
        # Fix: explicit UTF-8 so non-ASCII extractions do not fail on
        # platforms with a non-UTF-8 default encoding.
        Path(output).write_text(content, encoding="utf-8")
        click.echo(f"Output written to {output}")
    else:
        click.echo(content)
96
+
97
+
98
@cli.command("run")
@click.argument("file")
@click.option("--schema", "schema_file", required=True, help="Schema YAML file")
@click.option("--parser", default="docling", help="Parser to use")
@click.option("--extractor", default="langextract", help="Extractor to use")
@click.option("--model", "model_id", required=True, help="LLM model ID")
@click.option("--output", "-o", default=None, help="Output file (default: stdout)")
def run_pipeline(
    file: str,
    schema_file: str,
    parser: str,
    extractor: str,
    model_id: str,
    output: str | None,
) -> None:
    """Run the full pipeline: parse + extract.

    Emits the pipeline result as JSON to stdout or --output.
    """
    import yaml

    from docpipe.core.pipeline import Pipeline
    from docpipe.core.types import ExtractionSchema

    # Fix: read the schema file as UTF-8 regardless of platform locale.
    # The CLI --model flag overrides any model_id present in the YAML.
    with open(schema_file, encoding="utf-8") as f:
        schema_data = yaml.safe_load(f)
    schema_data["model_id"] = model_id
    schema = ExtractionSchema(**schema_data)

    pipeline = Pipeline(parser=parser, extractor=extractor)
    result = pipeline.run(file, schema)

    content = result.model_dump_json(indent=2)

    if output:
        # Fix: explicit UTF-8 avoids encode errors under non-UTF-8 defaults.
        Path(output).write_text(content, encoding="utf-8")
        click.echo(f"Output written to {output}")
    else:
        click.echo(content)
134
+
135
+
136
@cli.command()
@click.argument("file")
@click.option("--db", required=True, help="Database connection string")
@click.option("--table", required=True, help="Target table name")
@click.option(
    "--embedding-provider", required=True,
    help="Embedding provider (openai, ollama, huggingface, google)",
)
@click.option("--embedding-model", required=True, help="Embedding model name")
@click.option("--mode", default="both", type=click.Choice(["chunks", "extractions", "both"]))
@click.option("--chunk-size", default=1000, help="Chunk size for text splitting")
@click.option("--chunk-overlap", default=200, help="Chunk overlap")
@click.option("--parser", default="docling", help="Parser to use")
def ingest(
    file: str,
    db: str,
    table: str,
    embedding_provider: str,
    embedding_model: str,
    mode: str,
    chunk_size: int,
    chunk_overlap: int,
    parser: str,
) -> None:
    """Parse a document and ingest into a vector database."""
    from docpipe.core.types import IngestionConfig
    from docpipe.ingestion.pipeline import IngestionPipeline
    from docpipe.registry.registry import PluginRegistry

    # Bundle the CLI flags into a single ingestion configuration.
    ingestion_config = IngestionConfig(
        connection_string=db,
        table_name=table,
        embedding_provider=embedding_provider,
        embedding_model=embedding_model,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        ingest_mode=mode,
    )

    # Parse first, then hand the parsed document to the ingestion pipeline.
    document = PluginRegistry.get().get_parser(parser).parse(file)
    outcome = IngestionPipeline(ingestion_config).ingest(document)

    click.echo(f"Ingested {outcome.chunks_ingested} chunks into '{outcome.table_name}'")
    if outcome.table_created:
        click.echo("Table was created.")
184
+
185
+
186
@cli.command()
@click.argument("query")
@click.option("--db", required=True, help="Database connection string")
@click.option("--table", required=True, help="Table name to search")
@click.option("--embedding-provider", required=True, help="Embedding provider")
@click.option("--embedding-model", required=True, help="Embedding model name")
@click.option("--top-k", default=10, help="Number of results")
def search(
    query: str,
    db: str,
    table: str,
    embedding_provider: str,
    embedding_model: str,
    top_k: int,
) -> None:
    """Search for similar documents in the vector database."""
    from docpipe.core.types import IngestionConfig
    from docpipe.ingestion.pipeline import IngestionPipeline

    pipeline = IngestionPipeline(
        IngestionConfig(
            connection_string=db,
            table_name=table,
            embedding_provider=embedding_provider,
            embedding_model=embedding_model,
        )
    )
    hits = pipeline.search(query, top_k=top_k)

    for rank, hit in enumerate(hits, start=1):
        click.echo(f"\n--- Result {rank} (score: {hit['score']:.4f}) ---")
        # Cap the displayed chunk so one result stays readable on screen.
        click.echo(hit["content"][:500])
        if hit["metadata"]:
            click.echo(f"Metadata: {json.dumps(hit['metadata'], default=str)}")
220
+
221
+
222
@cli.command()
@click.option("--host", default="0.0.0.0", help="Server host")
@click.option("--port", default=8000, help="Server port")
@click.option("--reload", is_flag=True, help="Enable auto-reload for development")
def serve(host: str, port: int, reload: bool) -> None:
    """Start the docpipe API server."""
    # uvicorn is an optional dependency; exit with a hint when it is absent.
    try:
        import uvicorn
    except ImportError:
        click.echo("Server requires uvicorn. Install with: pip install docpipe[server]", err=True)
        sys.exit(1)
    else:
        click.echo(f"Starting docpipe server on {host}:{port}")
        uvicorn.run("docpipe.server.app:app", host=host, port=port, reload=reload)
236
+
237
+
238
@cli.group()
def plugins() -> None:
    """Manage plugins."""
    # Container group only; the actual work happens in subcommands
    # such as `docpipe plugins list`.
241
+
242
+
243
@plugins.command("list")
def plugins_list() -> None:
    """List all registered plugins."""
    from docpipe.registry.registry import PluginRegistry

    registry = PluginRegistry.get()

    def _render(title: str, names, info_for) -> None:
        # Shared renderer for the parser and extractor sections.
        click.echo(title)
        for plugin_name in names:
            details = info_for(plugin_name)
            status = "available" if details.get("available") else "not installed"
            click.echo(f" - {plugin_name} ({status})")

    _render("Parsers:", registry.list_parsers(), registry.parser_info)
    _render("\nExtractors:", registry.list_extractors(), registry.extractor_info)
261
+
262
+
263
@cli.group()
def config() -> None:
    """Manage configuration."""
    # Container group only; see `docpipe config init` for subcommands.
266
+
267
+
268
@config.command("init")
@click.option("--output", "-o", default="docpipe.yaml", help="Output file path")
def config_init(output: str) -> None:
    """Generate a template configuration file.

    Writes a commented starter docpipe.yaml to --output.
    """
    template = """\
# docpipe configuration
# See https://github.com/thesunnysinha/docpipe for documentation

# Parser settings
default_parser: docling
parser_options: {}

# Extractor settings
default_extractor: langextract
extractor_options: {}

# Ingestion settings (provide your own DB connection)
# db_connection_string: postgresql://user:pass@host:5432/dbname
# db_table_name: docpipe_documents
# embedding_provider: openai
# embedding_model: text-embedding-3-small
# chunk_size: 1000
# chunk_overlap: 200
# ingest_mode: both

# Server settings
server_host: "0.0.0.0"
server_port: 8000

# Pipeline settings
max_concurrency: 4

# Logging
log_level: INFO
"""
    # Fix: write UTF-8 explicitly so the template is byte-identical on
    # every platform regardless of the locale's default encoding.
    Path(output).write_text(template, encoding="utf-8")
    click.echo(f"Config template written to {output}")
305
+
306
+
307
if __name__ == "__main__":
    # Support running this module directly (python -m docpipe.cli.main).
    cli()
File without changes
@@ -0,0 +1,54 @@
1
+ """YAML and environment-based configuration loader."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+
11
+ from docpipe.config.settings import DocpipeSettings
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# Config files probed in order when no explicit path is given:
# project-local docpipe.yaml / docpipe.yml first, then the per-user
# config under ~/.config/docpipe/. The first existing file wins.
DEFAULT_CONFIG_PATHS = [
    Path("docpipe.yaml"),
    Path("docpipe.yml"),
    Path.home() / ".config" / "docpipe" / "config.yaml",
]
20
+
21
+
22
def load_config(path: str | Path | None = None) -> DocpipeSettings:
    """Load configuration from YAML file, env vars, or defaults.

    Priority (highest to lowest):
    1. Environment variables (DOCPIPE_*)
    2. Explicit config file path
    3. Auto-discovered config files (docpipe.yaml in cwd, ~/.config/docpipe/)
    4. Defaults
    """
    overrides: dict[str, Any] = {}

    if path is None:
        # No explicit path: probe the default locations, first hit wins.
        for candidate in DEFAULT_CONFIG_PATHS:
            if candidate.exists():
                overrides = _load_yaml(candidate)
                logger.info("Loaded config from %s", candidate)
                break
    else:
        config_path = Path(path)
        if config_path.exists():
            overrides = _load_yaml(config_path)
            logger.info("Loaded config from %s", config_path)
        else:
            # A missing explicit file is logged, not fatal; defaults apply.
            logger.warning("Config file not found: %s", config_path)

    # DocpipeSettings itself applies DOCPIPE_* env vars on top of these.
    return DocpipeSettings(**overrides)
48
+
49
+
50
def _load_yaml(path: Path) -> dict[str, Any]:
    """Load and parse a YAML config file.

    Returns an empty dict when the file is empty or its top-level value
    is not a mapping, so callers can always splat the result.
    """
    # Fix: read as UTF-8 explicitly — the locale default encoding can
    # mis-decode config files on platforms where it is not UTF-8.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return data if isinstance(data, dict) else {}
@@ -0,0 +1,41 @@
1
+ """Pydantic Settings for docpipe configuration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from pydantic import Field
8
+ from pydantic_settings import BaseSettings
9
+
10
+
11
class DocpipeSettings(BaseSettings):
    """Configuration loaded from env vars, YAML, or constructor.

    Environment variables use the ``DOCPIPE_`` prefix with ``__`` as the
    nested-field delimiter (e.g. ``DOCPIPE_SERVER_PORT=9000``).
    """

    model_config = {"env_prefix": "DOCPIPE_", "env_nested_delimiter": "__"}

    # Parser settings
    default_parser: str = "docling"
    parser_options: dict[str, Any] = Field(default_factory=dict)

    # Extractor settings
    default_extractor: str = "langextract"
    extractor_options: dict[str, Any] = Field(default_factory=dict)

    # Ingestion settings (connection/provider default to None — presumably
    # ingestion is unconfigured until the user supplies them; verify against
    # the ingestion pipeline's handling)
    db_connection_string: str | None = None
    db_table_name: str = "docpipe_documents"
    embedding_provider: str | None = None
    embedding_model: str | None = None
    chunk_size: int = 1000
    chunk_overlap: int = 200
    ingest_mode: str = "both"

    # Server settings
    server_host: str = "0.0.0.0"
    server_port: int = 8000

    # Pipeline settings
    max_concurrency: int = 4

    # Logging
    log_level: str = "INFO"
File without changes
docpipe/core/errors.py ADDED
@@ -0,0 +1,41 @@
1
+ """Exception hierarchy for docpipe."""
2
+
3
+
4
class DocpipeError(Exception):
    """Root of the docpipe exception hierarchy; catch this for any docpipe failure."""


class ParserNotFoundError(DocpipeError):
    """No parser is registered under the requested name."""


class ExtractorNotFoundError(DocpipeError):
    """No extractor is registered under the requested name."""


class ParserNotInstalledError(DocpipeError):
    """The parser exists but its backing library is not installed."""


class ExtractorNotInstalledError(DocpipeError):
    """The extractor exists but its backing library is not installed."""


class ParseError(DocpipeError):
    """Parsing a document failed."""


class ExtractionError(DocpipeError):
    """Structured extraction from text failed."""


class IngestionError(DocpipeError):
    """Ingesting content into the vector store failed."""


class ConfigurationError(DocpipeError):
    """The supplied configuration is invalid."""


class UnsupportedFormatError(DocpipeError):
    """The selected parser cannot handle the document's format."""
@@ -0,0 +1,37 @@
1
+ """BaseExtractor protocol for structured data extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol, runtime_checkable
6
+
7
+ from docpipe.core.types import ExtractionResult, ExtractionSchema
8
+
9
+
10
@runtime_checkable
class BaseExtractor(Protocol):
    """Protocol that all structured extractors must implement.

    ``runtime_checkable`` permits ``isinstance`` checks against this
    protocol; note such checks only verify that the methods exist, not
    their signatures.
    """

    # Identifier for this extractor implementation.
    name: str

    def extract(
        self,
        text: str,
        schema: ExtractionSchema,
        **kwargs: object,
    ) -> list[ExtractionResult]:
        """Extract structured data from text.

        Args:
            text: Input text to extract from.
            schema: Schema describing the desired structured output.
            **kwargs: Implementation-specific options.

        Returns:
            A list of extraction results.
        """
        ...

    async def aextract(
        self,
        text: str,
        schema: ExtractionSchema,
        **kwargs: object,
    ) -> list[ExtractionResult]:
        """Async variant of extract (same arguments and return type)."""
        ...

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the underlying library is installed."""
        ...
docpipe/core/parser.py ADDED
@@ -0,0 +1,36 @@
1
+ """BaseParser protocol for document parsers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol, runtime_checkable
6
+
7
+ from docpipe.core.types import ParsedDocument
8
+
9
+
10
@runtime_checkable
class BaseParser(Protocol):
    """Protocol that all document parsers must implement.

    ``runtime_checkable`` permits ``isinstance`` checks against this
    protocol; note such checks only verify that the methods exist, not
    their signatures.
    """

    # Identifier for this parser implementation.
    name: str

    def parse(self, source: str, **kwargs: object) -> ParsedDocument:
        """Parse a single document from a file path or URL."""
        ...

    async def aparse(self, source: str, **kwargs: object) -> ParsedDocument:
        """Async variant of parse (same arguments and return type)."""
        ...

    def parse_batch(self, sources: list[str], **kwargs: object) -> list[ParsedDocument]:
        """Parse multiple documents."""
        ...

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the underlying library is installed."""
        ...

    @classmethod
    def supported_formats(cls) -> list[str]:
        """Return list of supported format strings."""
        ...