docling-graph 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_graph-0.2.4/LICENSE +21 -0
- docling_graph-0.2.4/PKG-INFO +310 -0
- docling_graph-0.2.4/README.md +255 -0
- docling_graph-0.2.4/docling_graph/__init__.py +14 -0
- docling_graph-0.2.4/docling_graph/cli/__init__.py +0 -0
- docling_graph-0.2.4/docling_graph/cli/commands/__init__.py +8 -0
- docling_graph-0.2.4/docling_graph/cli/commands/convert.py +212 -0
- docling_graph-0.2.4/docling_graph/cli/commands/init.py +85 -0
- docling_graph-0.2.4/docling_graph/cli/commands/inspect.py +117 -0
- docling_graph-0.2.4/docling_graph/cli/config_builder.py +330 -0
- docling_graph-0.2.4/docling_graph/cli/config_utils.py +84 -0
- docling_graph-0.2.4/docling_graph/cli/constants.py +49 -0
- docling_graph-0.2.4/docling_graph/cli/dependencies.py +213 -0
- docling_graph-0.2.4/docling_graph/cli/main.py +37 -0
- docling_graph-0.2.4/docling_graph/cli/validators.py +226 -0
- docling_graph-0.2.4/docling_graph/config.py +191 -0
- docling_graph-0.2.4/docling_graph/core/__init__.py +34 -0
- docling_graph-0.2.4/docling_graph/core/converters/__init__.py +0 -0
- docling_graph-0.2.4/docling_graph/core/converters/config.py +48 -0
- docling_graph-0.2.4/docling_graph/core/converters/graph_converter.py +278 -0
- docling_graph-0.2.4/docling_graph/core/converters/models.py +48 -0
- docling_graph-0.2.4/docling_graph/core/converters/node_id_registry.py +142 -0
- docling_graph-0.2.4/docling_graph/core/exporters/__init__.py +8 -0
- docling_graph-0.2.4/docling_graph/core/exporters/base.py +31 -0
- docling_graph-0.2.4/docling_graph/core/exporters/csv_exporter.py +88 -0
- docling_graph-0.2.4/docling_graph/core/exporters/cypher_exporter.py +170 -0
- docling_graph-0.2.4/docling_graph/core/exporters/docling_exporter.py +101 -0
- docling_graph-0.2.4/docling_graph/core/exporters/json_exporter.py +87 -0
- docling_graph-0.2.4/docling_graph/core/extractors/__init__.py +4 -0
- docling_graph-0.2.4/docling_graph/core/extractors/backends/__init__.py +0 -0
- docling_graph-0.2.4/docling_graph/core/extractors/backends/llm_backend.py +182 -0
- docling_graph-0.2.4/docling_graph/core/extractors/backends/vlm_backend.py +150 -0
- docling_graph-0.2.4/docling_graph/core/extractors/chunk_batcher.py +256 -0
- docling_graph-0.2.4/docling_graph/core/extractors/document_chunker.py +250 -0
- docling_graph-0.2.4/docling_graph/core/extractors/document_processor.py +276 -0
- docling_graph-0.2.4/docling_graph/core/extractors/extractor_base.py +27 -0
- docling_graph-0.2.4/docling_graph/core/extractors/factory.py +96 -0
- docling_graph-0.2.4/docling_graph/core/extractors/strategies/__init__.py +0 -0
- docling_graph-0.2.4/docling_graph/core/extractors/strategies/many_to_one.py +450 -0
- docling_graph-0.2.4/docling_graph/core/extractors/strategies/one_to_one.py +114 -0
- docling_graph-0.2.4/docling_graph/core/utils/__init__.py +0 -0
- docling_graph-0.2.4/docling_graph/core/utils/dict_merger.py +144 -0
- docling_graph-0.2.4/docling_graph/core/utils/graph_cleaner.py +298 -0
- docling_graph-0.2.4/docling_graph/core/utils/stats_calculator.py +71 -0
- docling_graph-0.2.4/docling_graph/core/utils/string_formatter.py +94 -0
- docling_graph-0.2.4/docling_graph/core/visualizers/__init__.py +6 -0
- docling_graph-0.2.4/docling_graph/core/visualizers/base.py +32 -0
- docling_graph-0.2.4/docling_graph/core/visualizers/interactive_visualizer.py +324 -0
- docling_graph-0.2.4/docling_graph/core/visualizers/report_generator.py +153 -0
- docling_graph-0.2.4/docling_graph/db_clients/__init__.py +0 -0
- docling_graph-0.2.4/docling_graph/llm_clients/__init__.py +128 -0
- docling_graph-0.2.4/docling_graph/llm_clients/base.py +38 -0
- docling_graph-0.2.4/docling_graph/llm_clients/config.py +547 -0
- docling_graph-0.2.4/docling_graph/llm_clients/gemini.py +125 -0
- docling_graph-0.2.4/docling_graph/llm_clients/mistral.py +141 -0
- docling_graph-0.2.4/docling_graph/llm_clients/ollama.py +122 -0
- docling_graph-0.2.4/docling_graph/llm_clients/openai.py +124 -0
- docling_graph-0.2.4/docling_graph/llm_clients/prompts.py +155 -0
- docling_graph-0.2.4/docling_graph/llm_clients/vllm.py +142 -0
- docling_graph-0.2.4/docling_graph/llm_clients/watsonx.py +214 -0
- docling_graph-0.2.4/docling_graph/pipeline.py +283 -0
- docling_graph-0.2.4/docling_graph/protocols.py +260 -0
- docling_graph-0.2.4/docling_graph.egg-info/PKG-INFO +310 -0
- docling_graph-0.2.4/docling_graph.egg-info/SOURCES.txt +68 -0
- docling_graph-0.2.4/docling_graph.egg-info/dependency_links.txt +1 -0
- docling_graph-0.2.4/docling_graph.egg-info/entry_points.txt +2 -0
- docling_graph-0.2.4/docling_graph.egg-info/requires.txt +39 -0
- docling_graph-0.2.4/docling_graph.egg-info/top_level.txt +1 -0
- docling_graph-0.2.4/pyproject.toml +258 -0
- docling_graph-0.2.4/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 International Business Machines
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docling_graph
|
|
3
|
+
Version: 0.2.4
|
|
4
|
+
Summary: A tool to convert documents into knowledge graphs using Docling.
|
|
5
|
+
Author-email: Michele Dolfi <dol@zurich.ibm.com>, Ayoub El Bouchtili <ayoub.elbouchtili@fr.ibm.com>, Maxime Gillot <Maxime.Gillot@ibm.com>, Sophie Lang <sophie.lang@de.ibm.com>, Guilhaume Leroy Meline <guilhaume@fr.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: homepage, https://github.com/ayoub-ibm/docling-graph
|
|
8
|
+
Project-URL: repository, https://github.com/ayoub-ibm/docling-graph
|
|
9
|
+
Project-URL: issues, https://github.com/ayoub-ibm/docling-graph/issues
|
|
10
|
+
Project-URL: changelog, https://github.com/ayoub-ibm/docling-graph/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: docling,knowledge-graph,nlp,pdf,graph
|
|
12
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
13
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
14
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Requires-Python: <4.0,>=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: docling[vlm]<3.0.0,>=2.60.0
|
|
25
|
+
Requires-Dist: docling-core[chunking,chunking-openai]<3.0.0,>=2.50.0
|
|
26
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
27
|
+
Requires-Dist: networkx<4.0.0,>=3.0.0
|
|
28
|
+
Requires-Dist: rich<15,>=13
|
|
29
|
+
Requires-Dist: typer[all]<1.0.0,>=0.12
|
|
30
|
+
Requires-Dist: python-dotenv<2.0,>=1.0
|
|
31
|
+
Provides-Extra: ollama
|
|
32
|
+
Requires-Dist: ollama; extra == "ollama"
|
|
33
|
+
Provides-Extra: vllm
|
|
34
|
+
Requires-Dist: openai; extra == "vllm"
|
|
35
|
+
Provides-Extra: gemini
|
|
36
|
+
Requires-Dist: google-genai; extra == "gemini"
|
|
37
|
+
Provides-Extra: mistral
|
|
38
|
+
Requires-Dist: mistralai; extra == "mistral"
|
|
39
|
+
Provides-Extra: openai
|
|
40
|
+
Requires-Dist: openai; extra == "openai"
|
|
41
|
+
Provides-Extra: watsonx
|
|
42
|
+
Requires-Dist: ibm-watsonx-ai; extra == "watsonx"
|
|
43
|
+
Provides-Extra: local
|
|
44
|
+
Requires-Dist: docling-graph[ollama]; extra == "local"
|
|
45
|
+
Requires-Dist: docling-graph[vllm]; extra == "local"
|
|
46
|
+
Provides-Extra: remote
|
|
47
|
+
Requires-Dist: docling-graph[openai]; extra == "remote"
|
|
48
|
+
Requires-Dist: docling-graph[mistral]; extra == "remote"
|
|
49
|
+
Requires-Dist: docling-graph[gemini]; extra == "remote"
|
|
50
|
+
Requires-Dist: docling-graph[watsonx]; extra == "remote"
|
|
51
|
+
Provides-Extra: all
|
|
52
|
+
Requires-Dist: docling-graph[local]; extra == "all"
|
|
53
|
+
Requires-Dist: docling-graph[remote]; extra == "all"
|
|
54
|
+
Dynamic: license-file
|
|
55
|
+
|
|
56
|
+
<p align="center"><br>
|
|
57
|
+
<a href="https://github.com/IBM/docling-graph">
|
|
58
|
+
<img loading="lazy" alt="Docling Graph" src="docs/assets/logo.png" width="280"/>
|
|
59
|
+
</a>
|
|
60
|
+
</p>
|
|
61
|
+
|
|
62
|
+
# Docling Graph
|
|
63
|
+
|
|
64
|
+
[](https://ibm.github.io/docling-graph)
|
|
65
|
+
[](https://github.com/docling-project/docling)
|
|
66
|
+
[](https://pypi.org/project/docling-graph/)
|
|
67
|
+
[](https://www.python.org/downloads/)
|
|
68
|
+
[](https://github.com/astral-sh/uv)
|
|
69
|
+
[](https://github.com/astral-sh/ruff)
|
|
70
|
+
[](https://networkx.org/)
|
|
71
|
+
[](https://pydantic.dev)
|
|
72
|
+
[](https://typer.tiangolo.com/)
|
|
73
|
+
[](https://github.com/Textualize/rich)
|
|
74
|
+
[](https://vllm.ai/)
|
|
75
|
+
[](https://ollama.ai/)
|
|
76
|
+
[](https://lfaidata.foundation/projects/)
|
|
77
|
+
[](https://opensource.org/licenses/MIT)
|
|
78
|
+
[](https://www.bestpractices.dev/projects/11598)
|
|
79
|
+
|
|
80
|
+
Docling-Graph converts documents into validated **Pydantic** objects and then into a **directed knowledge graph**, with exports to CSV or Cypher and both static and interactive visualizations.
|
|
81
|
+
|
|
82
|
+
This transformation of unstructured documents into validated knowledge graphs with precise semantic relationships is essential for complex domains like **chemistry, finance, and physics**, where AI systems must understand exact entity connections (e.g., chemical compounds and their reactions, financial instruments and their dependencies, physical properties and their measurements) rather than approximate text vectors, **enabling explainable reasoning over technical document collections**.
|
|
83
|
+
|
|
84
|
+
The toolkit supports two extraction families: **local VLM** via Docling and **LLM-based extraction** via local (vLLM, Ollama) or API providers (Mistral, OpenAI, Gemini, IBM WatsonX), all orchestrated by a flexible, config-driven pipeline.
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
## Key Capabilities
|
|
89
|
+
|
|
90
|
+
- **🧠 Extraction**:
|
|
91
|
+
- Local `VLM` (Docling's information extraction pipeline - ideal for small documents with key-value focus)
|
|
92
|
+
- `LLM` (local via vLLM/Ollama or remote via Mistral/OpenAI/Gemini/IBM WatsonX API)
|
|
93
|
+
- `Hybrid Chunking` Leveraging Docling's segmentation with semantic LLM chunking for more context-aware extraction
|
|
94
|
+
- `Page-wise` or `whole-document` conversion strategies for flexible processing
|
|
95
|
+
- **🔨 Graph Construction**:
|
|
96
|
+
- Markdown to Graph: Convert validated Pydantic instances to a `NetworkX DiGraph` with rich edge metadata and stable node IDs
|
|
97
|
+
- Smart Merge: Combine multi-page documents into a single Pydantic instance for unified processing
|
|
98
|
+
- Modular graph module with enhanced type safety and configuration
|
|
99
|
+
- **📦 Export**:
|
|
100
|
+
- `Docling Document` exports (JSON format with full document structure)
|
|
101
|
+
- `Markdown` exports (full document and per-page options)
|
|
102
|
+
- `CSV` compatible with `Neo4j` admin import
|
|
103
|
+
- `Cypher` script generation for bulk ingestion
|
|
104
|
+
- `JSON` export for general-purpose graph data
|
|
105
|
+
- **📊 Visualization**:
|
|
106
|
+
- Interactive `HTML` visualization in full-page browser view with enhanced node/edge exploration
|
|
107
|
+
- Detailed `MARKDOWN` report with graph nodes content and edges
|
|
108
|
+
|
|
109
|
+
### Coming Soon
|
|
110
|
+
|
|
111
|
+
* 🪜 **Multi-Stage Extraction:** Define `extraction_stage` in templates to control multi-pass extraction.
|
|
112
|
+
* 🧩 **Interactive Template Builder:** Guided workflows for building Pydantic templates.
|
|
113
|
+
* 🧬 **Ontology-Based Templates:** Match content to the best Pydantic template using semantic similarity.
|
|
114
|
+
* ✍🏻 **Flexible Inputs:** Accepts `text`, `markdown`, and `DoclingDocument` directly.
|
|
115
|
+
* ⚡ **Batch Optimization:** Faster GPU inference with better memory handling.
|
|
116
|
+
* 💾 **Graph Database Integration:** Export data straight into `Neo4j`, `ArangoDB`, and similar databases.
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
## Initial Setup
|
|
121
|
+
|
|
122
|
+
### Requirements
|
|
123
|
+
|
|
124
|
+
- Python 3.10 or higher
|
|
125
|
+
- UV package manager
|
|
126
|
+
|
|
127
|
+
### Installation
|
|
128
|
+
|
|
129
|
+
#### 1. Clone the Repository
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
git clone https://github.com/IBM/docling-graph
|
|
133
|
+
cd docling-graph
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
#### 2. Install Dependencies
|
|
137
|
+
|
|
138
|
+
Choose the installation option that matches your use case:
|
|
139
|
+
|
|
140
|
+
| Option | Command | Description |
|
|
141
|
+
| :--- | :--- | :--- |
|
|
142
|
+
| **Minimal** | `uv sync` | Includes core VLM features (Docling), **no** LLM inference |
|
|
143
|
+
| **Full** | `uv sync --extra all` | Includes **all** features, VLM, and all local/remote LLM providers |
|
|
144
|
+
| **Local LLM** | `uv sync --extra local` | Adds support for vLLM and Ollama (requires GPU for vLLM) |
|
|
145
|
+
| **Remote API** | `uv sync --extra remote` | Adds support for Mistral, OpenAI, Gemini, and IBM WatsonX APIs |
|
|
146
|
+
| **WatsonX** | `uv sync --extra watsonx` | Adds support for IBM WatsonX foundation models (Granite, Llama, Mixtral) |
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
#### 3. OPTIONAL - GPU Support (PyTorch)
|
|
150
|
+
|
|
151
|
+
Follow the steps in [this guide](docs/guides/setup_with_gpu_support.md) to install PyTorch with NVIDIA GPU (CUDA) support.
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
### API Key Setup (for Remote Inference)
|
|
156
|
+
|
|
157
|
+
If you're using remote/cloud inference, set your API keys for the providers you plan to use:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
export OPENAI_API_KEY="..." # OpenAI
|
|
161
|
+
export MISTRAL_API_KEY="..." # Mistral
|
|
162
|
+
export GEMINI_API_KEY="..." # Google Gemini
|
|
163
|
+
export WATSONX_API_KEY="..." # IBM WatsonX
|
|
164
|
+
export WATSONX_PROJECT_ID="..." # IBM WatsonX Project ID
|
|
165
|
+
export WATSONX_URL="..." # IBM WatsonX URL (optional, defaults to US South)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
On Windows, replace `export` with `set` in Command Prompt or `$env:` in PowerShell.
|
|
169
|
+
|
|
170
|
+
Alternatively, add them to your `.env` file.
|
|
171
|
+
|
|
172
|
+
**Note:** For IBM WatsonX setup and available models, see the [WatsonX Integration Guide](docs/guides/watsonx_integration.md).
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
## Getting Started
|
|
177
|
+
|
|
178
|
+
Docling Graph is primarily driven by its **CLI**, but you can easily integrate the core pipeline into Python scripts.
|
|
179
|
+
|
|
180
|
+
### 1. Python Example
|
|
181
|
+
|
|
182
|
+
To run a conversion programmatically, you create a typed `PipelineConfig` object and pass it to the `run_pipeline` function. This example uses a **remote LLM API** in `many-to-one` mode for a single multi-page document:
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
from docling_graph import run_pipeline, PipelineConfig
|
|
186
|
+
from docs.examples.templates.rheology_research import Research # Pydantic model to use as an extraction template
|
|
187
|
+
|
|
188
|
+
# Create typed config
|
|
189
|
+
config = PipelineConfig(
|
|
190
|
+
source="docs/examples/data/research_paper/rheology.pdf",
|
|
191
|
+
template=Research,
|
|
192
|
+
backend="llm",
|
|
193
|
+
inference="remote",
|
|
194
|
+
processing_mode="many-to-one",
|
|
195
|
+
provider_override="mistral", # Specify your preferred provider and ensure its API key is set
|
|
196
|
+
model_override="mistral-medium-latest", # Specify your preferred LLM model
|
|
197
|
+
use_chunking=True, # Enable docling's hybrid chunker
|
|
198
|
+
llm_consolidation=False, # If False, programmatically merge batch-extracted dictionaries
|
|
199
|
+
output_dir="outputs/battery_research"
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
try:
|
|
203
|
+
run_pipeline(config)
|
|
204
|
+
print(f"\nExtraction complete! Graph data saved to: {config.output_dir}")
|
|
205
|
+
except Exception as e:
|
|
206
|
+
print(f"An error occurred: {e}")
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
### 2. CLI Example
|
|
211
|
+
|
|
212
|
+
Use the command-line interface for quick conversions and inspections. The following command runs the conversion using the local VLM backend and outputs a graph ready for Neo4j import:
|
|
213
|
+
|
|
214
|
+
#### 2.1. Initialize Configuration
|
|
215
|
+
|
|
216
|
+
A wizard will walk you through setting up the right config for your use case.
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
uv run docling-graph init
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Note: This command may take a little longer to start on the first run, as it checks for installed dependencies.
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
#### 2.2. Run Conversion
|
|
226
|
+
|
|
227
|
+
Run `docling-graph convert --help` to see the full list of available options and usage details.
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
# uv run docling-graph convert <SOURCE_FILE_PATH> --template "<TEMPLATE_DOTTED_PATH>" [OPTIONS]
|
|
231
|
+
|
|
232
|
+
uv run docling-graph convert "docs/examples/data/research_paper/rheology.pdf" \
|
|
233
|
+
--template "docs.examples.templates.rheology_research.Research" \
|
|
234
|
+
--output-dir "outputs/battery_research" \
|
|
235
|
+
--processing-mode "many-to-one" \
|
|
236
|
+
--use-chunking \
|
|
237
|
+
--no-llm-consolidation
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
#### 2.3. Inspect Results
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
# uv run docling-graph inspect <CONVERT_OUTPUT_PATH> [OPTIONS]
|
|
244
|
+
|
|
245
|
+
uv run docling-graph inspect outputs/battery_research
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
## Pydantic Templates
|
|
251
|
+
|
|
252
|
+
Templates are the foundation of Docling Graph, defining both the **extraction schema** and the resulting **graph structure**.
|
|
253
|
+
|
|
254
|
+
* Use `is_entity=True` in `model_config` to explicitly mark a class as a graph node.
|
|
255
|
+
* Leverage `model_config.graph_id_fields` to create stable, readable node IDs (natural keys).
|
|
256
|
+
* Use the `Edge()` helper to define explicit relationships between entities.
|
|
257
|
+
|
|
258
|
+
**Example:**
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
from pydantic import BaseModel, Field
|
|
262
|
+
from typing import Optional
|
|
263
|
+
|
|
264
|
+
class Person(BaseModel):
|
|
265
|
+
"""Person entity with stable ID based on name and DOB."""
|
|
266
|
+
model_config = {
|
|
267
|
+
'is_entity': True,
|
|
268
|
+
'graph_id_fields': ['last_name', 'date_of_birth']
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
first_name: str = Field(description="Person's first name")
|
|
272
|
+
last_name: str = Field(description="Person's last name")
|
|
273
|
+
date_of_birth: str = Field(description="Date of birth (YYYY-MM-DD)")
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
Reference Pydantic [templates](docs/examples/templates) are available to help you get started quickly.
|
|
277
|
+
|
|
278
|
+
For complete guidance, see: [Pydantic Templates for Knowledge Graph Extraction](docs/guides/create_pydantic_templates_for_kg_extraction.md)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
## Documentation
|
|
283
|
+
|
|
284
|
+
* *Work In Progress...*
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
## Examples
|
|
289
|
+
|
|
290
|
+
Get hands-on with Docling Graph [examples](docs/examples/scripts) to convert documents into knowledge graphs through `VLM` or `LLM`-based processing.
|
|
291
|
+
|
|
292
|
+
## License
|
|
293
|
+
|
|
294
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
## Acknowledgments
|
|
299
|
+
|
|
300
|
+
- Powered by [Docling](https://github.com/docling-project/docling) for advanced document processing.
|
|
301
|
+
- Uses [Pydantic](https://pydantic.dev) for data validation.
|
|
302
|
+
- Graph generation powered by [NetworkX](https://networkx.org/).
|
|
303
|
+
- Visualizations powered by [Cytoscape.js](https://js.cytoscape.org/).
|
|
304
|
+
- CLI powered by [Typer](https://typer.tiangolo.com/) and [Rich](https://github.com/Textualize/rich).
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
## IBM ❤️ Open Source AI
|
|
309
|
+
|
|
310
|
+
Docling Graph has been brought to you by IBM.
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
<p align="center"><br>
|
|
2
|
+
<a href="https://github.com/IBM/docling-graph">
|
|
3
|
+
<img loading="lazy" alt="Docling Graph" src="docs/assets/logo.png" width="280"/>
|
|
4
|
+
</a>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
# Docling Graph
|
|
8
|
+
|
|
9
|
+
[](https://ibm.github.io/docling-graph)
|
|
10
|
+
[](https://github.com/docling-project/docling)
|
|
11
|
+
[](https://pypi.org/project/docling-graph/)
|
|
12
|
+
[](https://www.python.org/downloads/)
|
|
13
|
+
[](https://github.com/astral-sh/uv)
|
|
14
|
+
[](https://github.com/astral-sh/ruff)
|
|
15
|
+
[](https://networkx.org/)
|
|
16
|
+
[](https://pydantic.dev)
|
|
17
|
+
[](https://typer.tiangolo.com/)
|
|
18
|
+
[](https://github.com/Textualize/rich)
|
|
19
|
+
[](https://vllm.ai/)
|
|
20
|
+
[](https://ollama.ai/)
|
|
21
|
+
[](https://lfaidata.foundation/projects/)
|
|
22
|
+
[](https://opensource.org/licenses/MIT)
|
|
23
|
+
[](https://www.bestpractices.dev/projects/11598)
|
|
24
|
+
|
|
25
|
+
Docling-Graph converts documents into validated **Pydantic** objects and then into a **directed knowledge graph**, with exports to CSV or Cypher and both static and interactive visualizations.
|
|
26
|
+
|
|
27
|
+
This transformation of unstructured documents into validated knowledge graphs with precise semantic relationships is essential for complex domains like **chemistry, finance, and physics**, where AI systems must understand exact entity connections (e.g., chemical compounds and their reactions, financial instruments and their dependencies, physical properties and their measurements) rather than approximate text vectors, **enabling explainable reasoning over technical document collections**.
|
|
28
|
+
|
|
29
|
+
The toolkit supports two extraction families: **local VLM** via Docling and **LLM-based extraction** via local (vLLM, Ollama) or API providers (Mistral, OpenAI, Gemini, IBM WatsonX), all orchestrated by a flexible, config-driven pipeline.
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
## Key Capabilities
|
|
34
|
+
|
|
35
|
+
- **🧠 Extraction**:
|
|
36
|
+
- Local `VLM` (Docling's information extraction pipeline - ideal for small documents with key-value focus)
|
|
37
|
+
- `LLM` (local via vLLM/Ollama or remote via Mistral/OpenAI/Gemini/IBM WatsonX API)
|
|
38
|
+
- `Hybrid Chunking` Leveraging Docling's segmentation with semantic LLM chunking for more context-aware extraction
|
|
39
|
+
- `Page-wise` or `whole-document` conversion strategies for flexible processing
|
|
40
|
+
- **🔨 Graph Construction**:
|
|
41
|
+
- Markdown to Graph: Convert validated Pydantic instances to a `NetworkX DiGraph` with rich edge metadata and stable node IDs
|
|
42
|
+
- Smart Merge: Combine multi-page documents into a single Pydantic instance for unified processing
|
|
43
|
+
- Modular graph module with enhanced type safety and configuration
|
|
44
|
+
- **📦 Export**:
|
|
45
|
+
- `Docling Document` exports (JSON format with full document structure)
|
|
46
|
+
- `Markdown` exports (full document and per-page options)
|
|
47
|
+
- `CSV` compatible with `Neo4j` admin import
|
|
48
|
+
- `Cypher` script generation for bulk ingestion
|
|
49
|
+
- `JSON` export for general-purpose graph data
|
|
50
|
+
- **📊 Visualization**:
|
|
51
|
+
- Interactive `HTML` visualization in full-page browser view with enhanced node/edge exploration
|
|
52
|
+
- Detailed `MARKDOWN` report with graph nodes content and edges
|
|
53
|
+
|
|
54
|
+
### Coming Soon
|
|
55
|
+
|
|
56
|
+
* 🪜 **Multi-Stage Extraction:** Define `extraction_stage` in templates to control multi-pass extraction.
|
|
57
|
+
* 🧩 **Interactive Template Builder:** Guided workflows for building Pydantic templates.
|
|
58
|
+
* 🧬 **Ontology-Based Templates:** Match content to the best Pydantic template using semantic similarity.
|
|
59
|
+
* ✍🏻 **Flexible Inputs:** Accepts `text`, `markdown`, and `DoclingDocument` directly.
|
|
60
|
+
* ⚡ **Batch Optimization:** Faster GPU inference with better memory handling.
|
|
61
|
+
* 💾 **Graph Database Integration:** Export data straight into `Neo4j`, `ArangoDB`, and similar databases.
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
## Initial Setup
|
|
66
|
+
|
|
67
|
+
### Requirements
|
|
68
|
+
|
|
69
|
+
- Python 3.10 or higher
|
|
70
|
+
- UV package manager
|
|
71
|
+
|
|
72
|
+
### Installation
|
|
73
|
+
|
|
74
|
+
#### 1. Clone the Repository
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
git clone https://github.com/IBM/docling-graph
|
|
78
|
+
cd docling-graph
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
#### 2. Install Dependencies
|
|
82
|
+
|
|
83
|
+
Choose the installation option that matches your use case:
|
|
84
|
+
|
|
85
|
+
| Option | Command | Description |
|
|
86
|
+
| :--- | :--- | :--- |
|
|
87
|
+
| **Minimal** | `uv sync` | Includes core VLM features (Docling), **no** LLM inference |
|
|
88
|
+
| **Full** | `uv sync --extra all` | Includes **all** features, VLM, and all local/remote LLM providers |
|
|
89
|
+
| **Local LLM** | `uv sync --extra local` | Adds support for vLLM and Ollama (requires GPU for vLLM) |
|
|
90
|
+
| **Remote API** | `uv sync --extra remote` | Adds support for Mistral, OpenAI, Gemini, and IBM WatsonX APIs |
|
|
91
|
+
| **WatsonX** | `uv sync --extra watsonx` | Adds support for IBM WatsonX foundation models (Granite, Llama, Mixtral) |
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
#### 3. OPTIONAL - GPU Support (PyTorch)
|
|
95
|
+
|
|
96
|
+
Follow the steps in [this guide](docs/guides/setup_with_gpu_support.md) to install PyTorch with NVIDIA GPU (CUDA) support.
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
### API Key Setup (for Remote Inference)
|
|
101
|
+
|
|
102
|
+
If you're using remote/cloud inference, set your API keys for the providers you plan to use:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
export OPENAI_API_KEY="..." # OpenAI
|
|
106
|
+
export MISTRAL_API_KEY="..." # Mistral
|
|
107
|
+
export GEMINI_API_KEY="..." # Google Gemini
|
|
108
|
+
export WATSONX_API_KEY="..." # IBM WatsonX
|
|
109
|
+
export WATSONX_PROJECT_ID="..." # IBM WatsonX Project ID
|
|
110
|
+
export WATSONX_URL="..." # IBM WatsonX URL (optional, defaults to US South)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
On Windows, replace `export` with `set` in Command Prompt or `$env:` in PowerShell.
|
|
114
|
+
|
|
115
|
+
Alternatively, add them to your `.env` file.
|
|
116
|
+
|
|
117
|
+
**Note:** For IBM WatsonX setup and available models, see the [WatsonX Integration Guide](docs/guides/watsonx_integration.md).
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
## Getting Started
|
|
122
|
+
|
|
123
|
+
Docling Graph is primarily driven by its **CLI**, but you can easily integrate the core pipeline into Python scripts.
|
|
124
|
+
|
|
125
|
+
### 1. Python Example
|
|
126
|
+
|
|
127
|
+
To run a conversion programmatically, you create a typed `PipelineConfig` object and pass it to the `run_pipeline` function. This example uses a **remote LLM API** in `many-to-one` mode for a single multi-page document:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from docling_graph import run_pipeline, PipelineConfig
|
|
131
|
+
from docs.examples.templates.rheology_research import Research # Pydantic model to use as an extraction template
|
|
132
|
+
|
|
133
|
+
# Create typed config
|
|
134
|
+
config = PipelineConfig(
|
|
135
|
+
source="docs/examples/data/research_paper/rheology.pdf",
|
|
136
|
+
template=Research,
|
|
137
|
+
backend="llm",
|
|
138
|
+
inference="remote",
|
|
139
|
+
processing_mode="many-to-one",
|
|
140
|
+
provider_override="mistral", # Specify your preferred provider and ensure its API key is set
|
|
141
|
+
model_override="mistral-medium-latest", # Specify your preferred LLM model
|
|
142
|
+
use_chunking=True, # Enable docling's hybrid chunker
|
|
143
|
+
llm_consolidation=False, # If False, programmatically merge batch-extracted dictionaries
|
|
144
|
+
output_dir="outputs/battery_research"
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
try:
|
|
148
|
+
run_pipeline(config)
|
|
149
|
+
print(f"\nExtraction complete! Graph data saved to: {config.output_dir}")
|
|
150
|
+
except Exception as e:
|
|
151
|
+
print(f"An error occurred: {e}")
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
### 2. CLI Example
|
|
156
|
+
|
|
157
|
+
Use the command-line interface for quick conversions and inspections. The following command runs the conversion using the local VLM backend and outputs a graph ready for Neo4j import:
|
|
158
|
+
|
|
159
|
+
#### 2.1. Initialize Configuration
|
|
160
|
+
|
|
161
|
+
A wizard will walk you through setting up the right config for your use case.
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
uv run docling-graph init
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Note: This command may take a little longer to start on the first run, as it checks for installed dependencies.
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
#### 2.2. Run Conversion
|
|
171
|
+
|
|
172
|
+
Run `docling-graph convert --help` to see the full list of available options and usage details.
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
# uv run docling-graph convert <SOURCE_FILE_PATH> --template "<TEMPLATE_DOTTED_PATH>" [OPTIONS]
|
|
176
|
+
|
|
177
|
+
uv run docling-graph convert "docs/examples/data/research_paper/rheology.pdf" \
|
|
178
|
+
--template "docs.examples.templates.rheology_research.Research" \
|
|
179
|
+
--output-dir "outputs/battery_research" \
|
|
180
|
+
--processing-mode "many-to-one" \
|
|
181
|
+
--use-chunking \
|
|
182
|
+
--no-llm-consolidation
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
#### 2.3. Inspect Results
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
# uv run docling-graph inspect <CONVERT_OUTPUT_PATH> [OPTIONS]
|
|
189
|
+
|
|
190
|
+
uv run docling-graph inspect outputs/battery_research
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
## Pydantic Templates
|
|
196
|
+
|
|
197
|
+
Templates are the foundation of Docling Graph, defining both the **extraction schema** and the resulting **graph structure**.
|
|
198
|
+
|
|
199
|
+
* Use `is_entity=True` in `model_config` to explicitly mark a class as a graph node.
|
|
200
|
+
* Leverage `model_config.graph_id_fields` to create stable, readable node IDs (natural keys).
|
|
201
|
+
* Use the `Edge()` helper to define explicit relationships between entities.
|
|
202
|
+
|
|
203
|
+
**Example:**
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from pydantic import BaseModel, Field
|
|
207
|
+
from typing import Optional
|
|
208
|
+
|
|
209
|
+
class Person(BaseModel):
|
|
210
|
+
"""Person entity with stable ID based on name and DOB."""
|
|
211
|
+
model_config = {
|
|
212
|
+
'is_entity': True,
|
|
213
|
+
'graph_id_fields': ['last_name', 'date_of_birth']
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
first_name: str = Field(description="Person's first name")
|
|
217
|
+
last_name: str = Field(description="Person's last name")
|
|
218
|
+
date_of_birth: str = Field(description="Date of birth (YYYY-MM-DD)")
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Reference Pydantic [templates](docs/examples/templates) are available to help you get started quickly.
|
|
222
|
+
|
|
223
|
+
For complete guidance, see: [Pydantic Templates for Knowledge Graph Extraction](docs/guides/create_pydantic_templates_for_kg_extraction.md)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
## Documentation
|
|
228
|
+
|
|
229
|
+
* *Work In Progress...*
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
## Examples
|
|
234
|
+
|
|
235
|
+
Get hands-on with Docling Graph [examples](docs/examples/scripts) to convert documents into knowledge graphs through `VLM` or `LLM`-based processing.
|
|
236
|
+
|
|
237
|
+
## License
|
|
238
|
+
|
|
239
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
## Acknowledgments
|
|
244
|
+
|
|
245
|
+
- Powered by [Docling](https://github.com/docling-project/docling) for advanced document processing.
|
|
246
|
+
- Uses [Pydantic](https://pydantic.dev) for data validation.
|
|
247
|
+
- Graph generation powered by [NetworkX](https://networkx.org/).
|
|
248
|
+
- Visualizations powered by [Cytoscape.js](https://js.cytoscape.org/).
|
|
249
|
+
- CLI powered by [Typer](https://typer.tiangolo.com/) and [Rich](https://github.com/Textualize/rich).
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
## IBM ❤️ Open Source AI
|
|
254
|
+
|
|
255
|
+
Docling Graph has been brought to you by IBM.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .config import LLMConfig, ModelConfig, ModelsConfig, PipelineConfig, VLMConfig
|
|
2
|
+
from .pipeline import run_pipeline
|
|
3
|
+
|
|
4
|
+
__version__ = "0.2.4"
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"LLMConfig",
|
|
8
|
+
"ModelConfig",
|
|
9
|
+
"ModelsConfig",
|
|
10
|
+
"PipelineConfig",
|
|
11
|
+
"VLMConfig",
|
|
12
|
+
"__version__",
|
|
13
|
+
"run_pipeline",
|
|
14
|
+
]
|
|
File without changes
|