docling-graph 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. docling_graph-0.2.4/LICENSE +21 -0
  2. docling_graph-0.2.4/PKG-INFO +310 -0
  3. docling_graph-0.2.4/README.md +255 -0
  4. docling_graph-0.2.4/docling_graph/__init__.py +14 -0
  5. docling_graph-0.2.4/docling_graph/cli/__init__.py +0 -0
  6. docling_graph-0.2.4/docling_graph/cli/commands/__init__.py +8 -0
  7. docling_graph-0.2.4/docling_graph/cli/commands/convert.py +212 -0
  8. docling_graph-0.2.4/docling_graph/cli/commands/init.py +85 -0
  9. docling_graph-0.2.4/docling_graph/cli/commands/inspect.py +117 -0
  10. docling_graph-0.2.4/docling_graph/cli/config_builder.py +330 -0
  11. docling_graph-0.2.4/docling_graph/cli/config_utils.py +84 -0
  12. docling_graph-0.2.4/docling_graph/cli/constants.py +49 -0
  13. docling_graph-0.2.4/docling_graph/cli/dependencies.py +213 -0
  14. docling_graph-0.2.4/docling_graph/cli/main.py +37 -0
  15. docling_graph-0.2.4/docling_graph/cli/validators.py +226 -0
  16. docling_graph-0.2.4/docling_graph/config.py +191 -0
  17. docling_graph-0.2.4/docling_graph/core/__init__.py +34 -0
  18. docling_graph-0.2.4/docling_graph/core/converters/__init__.py +0 -0
  19. docling_graph-0.2.4/docling_graph/core/converters/config.py +48 -0
  20. docling_graph-0.2.4/docling_graph/core/converters/graph_converter.py +278 -0
  21. docling_graph-0.2.4/docling_graph/core/converters/models.py +48 -0
  22. docling_graph-0.2.4/docling_graph/core/converters/node_id_registry.py +142 -0
  23. docling_graph-0.2.4/docling_graph/core/exporters/__init__.py +8 -0
  24. docling_graph-0.2.4/docling_graph/core/exporters/base.py +31 -0
  25. docling_graph-0.2.4/docling_graph/core/exporters/csv_exporter.py +88 -0
  26. docling_graph-0.2.4/docling_graph/core/exporters/cypher_exporter.py +170 -0
  27. docling_graph-0.2.4/docling_graph/core/exporters/docling_exporter.py +101 -0
  28. docling_graph-0.2.4/docling_graph/core/exporters/json_exporter.py +87 -0
  29. docling_graph-0.2.4/docling_graph/core/extractors/__init__.py +4 -0
  30. docling_graph-0.2.4/docling_graph/core/extractors/backends/__init__.py +0 -0
  31. docling_graph-0.2.4/docling_graph/core/extractors/backends/llm_backend.py +182 -0
  32. docling_graph-0.2.4/docling_graph/core/extractors/backends/vlm_backend.py +150 -0
  33. docling_graph-0.2.4/docling_graph/core/extractors/chunk_batcher.py +256 -0
  34. docling_graph-0.2.4/docling_graph/core/extractors/document_chunker.py +250 -0
  35. docling_graph-0.2.4/docling_graph/core/extractors/document_processor.py +276 -0
  36. docling_graph-0.2.4/docling_graph/core/extractors/extractor_base.py +27 -0
  37. docling_graph-0.2.4/docling_graph/core/extractors/factory.py +96 -0
  38. docling_graph-0.2.4/docling_graph/core/extractors/strategies/__init__.py +0 -0
  39. docling_graph-0.2.4/docling_graph/core/extractors/strategies/many_to_one.py +450 -0
  40. docling_graph-0.2.4/docling_graph/core/extractors/strategies/one_to_one.py +114 -0
  41. docling_graph-0.2.4/docling_graph/core/utils/__init__.py +0 -0
  42. docling_graph-0.2.4/docling_graph/core/utils/dict_merger.py +144 -0
  43. docling_graph-0.2.4/docling_graph/core/utils/graph_cleaner.py +298 -0
  44. docling_graph-0.2.4/docling_graph/core/utils/stats_calculator.py +71 -0
  45. docling_graph-0.2.4/docling_graph/core/utils/string_formatter.py +94 -0
  46. docling_graph-0.2.4/docling_graph/core/visualizers/__init__.py +6 -0
  47. docling_graph-0.2.4/docling_graph/core/visualizers/base.py +32 -0
  48. docling_graph-0.2.4/docling_graph/core/visualizers/interactive_visualizer.py +324 -0
  49. docling_graph-0.2.4/docling_graph/core/visualizers/report_generator.py +153 -0
  50. docling_graph-0.2.4/docling_graph/db_clients/__init__.py +0 -0
  51. docling_graph-0.2.4/docling_graph/llm_clients/__init__.py +128 -0
  52. docling_graph-0.2.4/docling_graph/llm_clients/base.py +38 -0
  53. docling_graph-0.2.4/docling_graph/llm_clients/config.py +547 -0
  54. docling_graph-0.2.4/docling_graph/llm_clients/gemini.py +125 -0
  55. docling_graph-0.2.4/docling_graph/llm_clients/mistral.py +141 -0
  56. docling_graph-0.2.4/docling_graph/llm_clients/ollama.py +122 -0
  57. docling_graph-0.2.4/docling_graph/llm_clients/openai.py +124 -0
  58. docling_graph-0.2.4/docling_graph/llm_clients/prompts.py +155 -0
  59. docling_graph-0.2.4/docling_graph/llm_clients/vllm.py +142 -0
  60. docling_graph-0.2.4/docling_graph/llm_clients/watsonx.py +214 -0
  61. docling_graph-0.2.4/docling_graph/pipeline.py +283 -0
  62. docling_graph-0.2.4/docling_graph/protocols.py +260 -0
  63. docling_graph-0.2.4/docling_graph.egg-info/PKG-INFO +310 -0
  64. docling_graph-0.2.4/docling_graph.egg-info/SOURCES.txt +68 -0
  65. docling_graph-0.2.4/docling_graph.egg-info/dependency_links.txt +1 -0
  66. docling_graph-0.2.4/docling_graph.egg-info/entry_points.txt +2 -0
  67. docling_graph-0.2.4/docling_graph.egg-info/requires.txt +39 -0
  68. docling_graph-0.2.4/docling_graph.egg-info/top_level.txt +1 -0
  69. docling_graph-0.2.4/pyproject.toml +258 -0
  70. docling_graph-0.2.4/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 International Business Machines
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,310 @@
1
+ Metadata-Version: 2.4
2
+ Name: docling_graph
3
+ Version: 0.2.4
4
+ Summary: A tool to convert documents into knowledge graphs using Docling.
5
+ Author-email: Michele Dolfi <dol@zurich.ibm.com>, Ayoub El Bouchtili <ayoub.elbouchtili@fr.ibm.com>, Maxime Gillot <Maxime.Gillot@ibm.com>, Sophie Lang <sophie.lang@de.ibm.com>, Guilhaume Leroy Meline <guilhaume@fr.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
+ License: MIT License
7
+ Project-URL: homepage, https://github.com/ayoub-ibm/docling-graph
8
+ Project-URL: repository, https://github.com/ayoub-ibm/docling-graph
9
+ Project-URL: issues, https://github.com/ayoub-ibm/docling-graph/issues
10
+ Project-URL: changelog, https://github.com/ayoub-ibm/docling-graph/blob/main/CHANGELOG.md
11
+ Keywords: docling,knowledge-graph,nlp,pdf,graph
12
+ Classifier: Operating System :: MacOS :: MacOS X
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: Operating System :: Microsoft :: Windows
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Requires-Python: <4.0,>=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: docling[vlm]<3.0.0,>=2.60.0
25
+ Requires-Dist: docling-core[chunking,chunking-openai]<3.0.0,>=2.50.0
26
+ Requires-Dist: pydantic<3.0.0,>=2.0.0
27
+ Requires-Dist: networkx<4.0.0,>=3.0.0
28
+ Requires-Dist: rich<15,>=13
29
+ Requires-Dist: typer[all]<1.0.0,>=0.12
30
+ Requires-Dist: python-dotenv<2.0,>=1.0
31
+ Provides-Extra: ollama
32
+ Requires-Dist: ollama; extra == "ollama"
33
+ Provides-Extra: vllm
34
+ Requires-Dist: openai; extra == "vllm"
35
+ Provides-Extra: gemini
36
+ Requires-Dist: google-genai; extra == "gemini"
37
+ Provides-Extra: mistral
38
+ Requires-Dist: mistralai; extra == "mistral"
39
+ Provides-Extra: openai
40
+ Requires-Dist: openai; extra == "openai"
41
+ Provides-Extra: watsonx
42
+ Requires-Dist: ibm-watsonx-ai; extra == "watsonx"
43
+ Provides-Extra: local
44
+ Requires-Dist: docling-graph[ollama]; extra == "local"
45
+ Requires-Dist: docling-graph[vllm]; extra == "local"
46
+ Provides-Extra: remote
47
+ Requires-Dist: docling-graph[openai]; extra == "remote"
48
+ Requires-Dist: docling-graph[mistral]; extra == "remote"
49
+ Requires-Dist: docling-graph[gemini]; extra == "remote"
50
+ Requires-Dist: docling-graph[watsonx]; extra == "remote"
51
+ Provides-Extra: all
52
+ Requires-Dist: docling-graph[local]; extra == "all"
53
+ Requires-Dist: docling-graph[remote]; extra == "all"
54
+ Dynamic: license-file
55
+
56
+ <p align="center"><br>
57
+ <a href="https://github.com/IBM/docling-graph">
58
+ <img loading="lazy" alt="Docling Graph" src="docs/assets/logo.png" width="280"/>
59
+ </a>
60
+ </p>
61
+
62
+ # Docling Graph
63
+
64
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ibm.github.io/docling-graph)
65
+ [![Docling](https://img.shields.io/badge/Docling-VLM-red)](https://github.com/docling-project/docling)
66
+ [![PyPI version](https://img.shields.io/pypi/v/docling-graph)](https://pypi.org/project/docling-graph/)
67
+ [![Python 3.10 | 3.11 | 3.12](https://img.shields.io/badge/Python-3.10%20%7C%203.11%20%7C%203.12-blue)](https://www.python.org/downloads/)
68
+ [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
69
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
70
+ [![NetworkX](https://img.shields.io/badge/NetworkX-3.0+-red)](https://networkx.org/)
71
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
72
+ [![Typer](https://img.shields.io/badge/Typer-CLI-purple)](https://typer.tiangolo.com/)
73
+ [![Rich](https://img.shields.io/badge/Rich-terminal-purple)](https://github.com/Textualize/rich)
74
+ [![vLLM](https://img.shields.io/badge/vLLM-compatible-brightgreen)](https://vllm.ai/)
75
+ [![Ollama](https://img.shields.io/badge/Ollama-compatible-brightgreen)](https://ollama.ai/)
76
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
77
+ [![License MIT](https://img.shields.io/github/license/IBM/docling-graph)](https://opensource.org/licenses/MIT)
78
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/11598/badge)](https://www.bestpractices.dev/projects/11598)
79
+
80
+ Docling-Graph converts documents into validated **Pydantic** objects and then into a **directed knowledge graph**, with exports to CSV or Cypher and both static and interactive visualizations.
81
+
82
+ This transformation of unstructured documents into validated knowledge graphs with precise semantic relationships is essential for complex domains like **chemistry, finance, and physics**, where AI systems must understand exact entity connections (e.g., chemical compounds and their reactions, financial instruments and their dependencies, physical properties and their measurements) rather than approximate text vectors, **enabling explainable reasoning over technical document collections**.
83
+
84
+ The toolkit supports two extraction families: **local VLM** via Docling and **LLM-based extraction** via local (vLLM, Ollama) or API providers (Mistral, OpenAI, Gemini, IBM WatsonX), all orchestrated by a flexible, config-driven pipeline.
85
+
86
+
87
+
88
+ ## Key Capabilities
89
+
90
+ - **🧠 Extraction**:
91
+ - Local `VLM` (Docling's information extraction pipeline - ideal for small documents with key-value focus)
92
+ - `LLM` (local via vLLM/Ollama or remote via Mistral/OpenAI/Gemini/IBM WatsonX API)
93
+ - `Hybrid Chunking` Leveraging Docling's segmentation with semantic LLM chunking for more context-aware extraction
94
+ - `Page-wise` or `whole-document` conversion strategies for flexible processing
95
+ - **🔨 Graph Construction**:
96
+ - Markdown to Graph: Convert validated Pydantic instances to a `NetworkX DiGraph` with rich edge metadata and stable node IDs
97
+ - Smart Merge: Combine multi-page documents into a single Pydantic instance for unified processing
98
+ - Modular graph module with enhanced type safety and configuration
99
+ - **📦 Export**:
100
+ - `Docling Document` exports (JSON format with full document structure)
101
+ - `Markdown` exports (full document and per-page options)
102
+ - `CSV` compatible with `Neo4j` admin import
103
+ - `Cypher` script generation for bulk ingestion
104
+ - `JSON` export for general-purpose graph data
105
+ - **📊 Visualization**:
106
+ - Interactive `HTML` visualization in full-page browser view with enhanced node/edge exploration
107
+ - Detailed `MARKDOWN` report with graph nodes content and edges
108
+
109
+ ### Coming Soon
110
+
111
+ * 🪜 **Multi-Stage Extraction:** Define `extraction_stage` in templates to control multi-pass extraction.
112
+ * 🧩 **Interactive Template Builder:** Guided workflows for building Pydantic templates.
113
+ * 🧬 **Ontology-Based Templates:** Match content to the best Pydantic template using semantic similarity.
114
+ * ✍🏻 **Flexible Inputs:** Accepts `text`, `markdown`, and `DoclingDocument` directly.
115
+ * ⚡ **Batch Optimization:** Faster GPU inference with better memory handling.
116
+ * 💾 **Graph Database Integration:** Export data straight into `Neo4j`, `ArangoDB`, and similar databases.
117
+
118
+
119
+
120
+ ## Initial Setup
121
+
122
+ ### Requirements
123
+
124
+ - Python 3.10 or higher
125
+ - UV package manager
126
+
127
+ ### Installation
128
+
129
+ #### 1. Clone the Repository
130
+
131
+ ```bash
132
+ git clone https://github.com/IBM/docling-graph
133
+ cd docling-graph
134
+ ```
135
+
136
+ #### 2. Install Dependencies
137
+
138
+ Choose the installation option that matches your use case:
139
+
140
+ | Option | Command | Description |
141
+ | :--- | :--- | :--- |
142
+ | **Minimal** | `uv sync` | Includes core VLM features (Docling), **no** LLM inference |
143
+ | **Full** | `uv sync --extra all` | Includes **all** features, VLM, and all local/remote LLM providers |
144
+ | **Local LLM** | `uv sync --extra local` | Adds support for vLLM and Ollama (requires GPU for vLLM) |
145
+ | **Remote API** | `uv sync --extra remote` | Adds support for Mistral, OpenAI, Gemini, and IBM WatsonX APIs |
146
+ | **WatsonX** | `uv sync --extra watsonx` | Adds support for IBM WatsonX foundation models (Granite, Llama, Mixtral) |
147
+
148
+
149
+ #### 3. OPTIONAL - GPU Support (PyTorch)
150
+
151
+ Follow the steps in [this guide](docs/guides/setup_with_gpu_support.md) to install PyTorch with NVIDIA GPU (CUDA) support.
152
+
153
+
154
+
155
+ ### API Key Setup (for Remote Inference)
156
+
157
+ If you're using remote/cloud inference, set your API keys for the providers you plan to use:
158
+
159
+ ```bash
160
+ export OPENAI_API_KEY="..." # OpenAI
161
+ export MISTRAL_API_KEY="..." # Mistral
162
+ export GEMINI_API_KEY="..." # Google Gemini
163
+ export WATSONX_API_KEY="..." # IBM WatsonX
164
+ export WATSONX_PROJECT_ID="..." # IBM WatsonX Project ID
165
+ export WATSONX_URL="..." # IBM WatsonX URL (optional, defaults to US South)
166
+ ```
167
+
168
+ On Windows, replace `export` with `set` in Command Prompt or `$env:` in PowerShell.
169
+
170
+ Alternatively, add them to your `.env` file.
171
+
172
+ **Note:** For IBM WatsonX setup and available models, see the [WatsonX Integration Guide](docs/guides/watsonx_integration.md).
173
+
174
+
175
+
176
+ ## Getting Started
177
+
178
+ Docling Graph is primarily driven by its **CLI**, but you can easily integrate the core pipeline into Python scripts.
179
+
180
+ ### 1. Python Example
181
+
182
+ To run a conversion programmatically, you define a typed `PipelineConfig` and pass it to the `run_pipeline` function. This example uses a **remote LLM API** in a `many-to-one` mode for a single multi-page document:
183
+
184
+ ```python
185
+ from docling_graph import run_pipeline, PipelineConfig
186
+ from docs.examples.templates.rheology_research import Research # Pydantic model to use as an extraction template
187
+
188
+ # Create typed config
189
+ config = PipelineConfig(
190
+ source="docs/examples/data/research_paper/rheology.pdf",
191
+ template=Research,
192
+ backend="llm",
193
+ inference="remote",
194
+ processing_mode="many-to-one",
195
+ provider_override="mistral", # Specify your preferred provider and ensure its API key is set
196
+ model_override="mistral-medium-latest", # Specify your preferred LLM model
197
+ use_chunking=True, # Enable docling's hybrid chunker
198
+ llm_consolidation=False, # If False, programmatically merge batch-extracted dictionaries
199
+ output_dir="outputs/battery_research"
200
+ )
201
+
202
+ try:
203
+ run_pipeline(config)
204
+ print(f"\nExtraction complete! Graph data saved to: {config.output_dir}")
205
+ except Exception as e:
206
+ print(f"An error occurred: {e}")
207
+ ```
208
+
209
+
210
+ ### 2. CLI Example
211
+
212
+ Use the command-line interface for quick conversions and inspections. The following command runs the conversion using the local VLM backend and outputs a graph ready for Neo4j import:
213
+
214
+ #### 2.1. Initialize Configuration
215
+
216
+ A wizard will walk you through setting up the right config for your use case.
217
+
218
+ ```bash
219
+ uv run docling-graph init
220
+ ```
221
+
222
+ Note: This command may take a little longer to start on the first run, as it checks for installed dependencies.
223
+
224
+
225
+ #### 2.2. Run Conversion
226
+
227
+ You can use `docling-graph convert --help` to see the full list of available options and usage details.
228
+
229
+ ```bash
230
+ # uv run docling-graph convert <SOURCE_FILE_PATH> --template "<TEMPLATE_DOTTED_PATH>" [OPTIONS]
231
+
232
+ uv run docling-graph convert "docs/examples/data/research_paper/rheology.pdf" \
233
+ --template "docs.examples.templates.rheology_research.Research" \
234
+ --output-dir "outputs/battery_research" \
235
+ --processing-mode "many-to-one" \
236
+ --use-chunking \
237
+ --no-llm-consolidation
238
+ ```
239
+
240
+ #### 2.3. Inspect Results
241
+
242
+ ```bash
243
+ # uv run docling-graph inspect <CONVERT_OUTPUT_PATH> [OPTIONS]
244
+
245
+ uv run docling-graph inspect outputs/battery_research
246
+ ```
247
+
248
+
249
+
250
+ ## Pydantic Templates
251
+
252
+ Templates are the foundation of Docling Graph, defining both the **extraction schema** and the resulting **graph structure**.
253
+
254
+ * Use `is_entity=True` in `model_config` to explicitly mark a class as a graph node.
255
+ * Leverage `model_config.graph_id_fields` to create stable, readable node IDs (natural keys).
256
+ * Use the `Edge()` helper to define explicit relationships between entities.
257
+
258
+ **Example:**
259
+
260
+ ```python
261
+ from pydantic import BaseModel, Field
262
+ from typing import Optional
263
+
264
+ class Person(BaseModel):
265
+ """Person entity with stable ID based on name and DOB."""
266
+ model_config = {
267
+ 'is_entity': True,
268
+ 'graph_id_fields': ['last_name', 'date_of_birth']
269
+ }
270
+
271
+ first_name: str = Field(description="Person's first name")
272
+ last_name: str = Field(description="Person's last name")
273
+ date_of_birth: str = Field(description="Date of birth (YYYY-MM-DD)")
274
+ ```
275
+
276
+ Reference Pydantic [templates](docs/examples/templates) are available to help you get started quickly.
277
+
278
+ For complete guidance, see: [Pydantic Templates for Knowledge Graph Extraction](docs/guides/create_pydantic_templates_for_kg_extraction.md)
279
+
280
+
281
+
282
+ ## Documentation
283
+
284
+ * *Work In Progress...*
285
+
286
+
287
+
288
+ ## Examples
289
+
290
+ Get hands-on with Docling Graph [examples](docs/examples/scripts) to convert documents into knowledge graphs through `VLM` or `LLM`-based processing.
291
+
292
+ ## License
293
+
294
+ MIT License - see [LICENSE](LICENSE) for details.
295
+
296
+
297
+
298
+ ## Acknowledgments
299
+
300
+ - Powered by [Docling](https://github.com/docling-project/docling) for advanced document processing.
301
+ - Uses [Pydantic](https://pydantic.dev) for data validation.
302
+ - Graph generation powered by [NetworkX](https://networkx.org/).
303
+ - Visualizations powered by [Cytoscape.js](https://js.cytoscape.org/).
304
+ - CLI powered by [Typer](https://typer.tiangolo.com/) and [Rich](https://github.com/Textualize/rich).
305
+
306
+
307
+
308
+ ## IBM ❤️ Open Source AI
309
+
310
+ Docling Graph has been brought to you by IBM.
@@ -0,0 +1,255 @@
1
+ <p align="center"><br>
2
+ <a href="https://github.com/IBM/docling-graph">
3
+ <img loading="lazy" alt="Docling Graph" src="docs/assets/logo.png" width="280"/>
4
+ </a>
5
+ </p>
6
+
7
+ # Docling Graph
8
+
9
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ibm.github.io/docling-graph)
10
+ [![Docling](https://img.shields.io/badge/Docling-VLM-red)](https://github.com/docling-project/docling)
11
+ [![PyPI version](https://img.shields.io/pypi/v/docling-graph)](https://pypi.org/project/docling-graph/)
12
+ [![Python 3.10 | 3.11 | 3.12](https://img.shields.io/badge/Python-3.10%20%7C%203.11%20%7C%203.12-blue)](https://www.python.org/downloads/)
13
+ [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
14
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
15
+ [![NetworkX](https://img.shields.io/badge/NetworkX-3.0+-red)](https://networkx.org/)
16
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
17
+ [![Typer](https://img.shields.io/badge/Typer-CLI-purple)](https://typer.tiangolo.com/)
18
+ [![Rich](https://img.shields.io/badge/Rich-terminal-purple)](https://github.com/Textualize/rich)
19
+ [![vLLM](https://img.shields.io/badge/vLLM-compatible-brightgreen)](https://vllm.ai/)
20
+ [![Ollama](https://img.shields.io/badge/Ollama-compatible-brightgreen)](https://ollama.ai/)
21
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
22
+ [![License MIT](https://img.shields.io/github/license/IBM/docling-graph)](https://opensource.org/licenses/MIT)
23
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/11598/badge)](https://www.bestpractices.dev/projects/11598)
24
+
25
+ Docling-Graph converts documents into validated **Pydantic** objects and then into a **directed knowledge graph**, with exports to CSV or Cypher and both static and interactive visualizations.
26
+
27
+ This transformation of unstructured documents into validated knowledge graphs with precise semantic relationships is essential for complex domains like **chemistry, finance, and physics**, where AI systems must understand exact entity connections (e.g., chemical compounds and their reactions, financial instruments and their dependencies, physical properties and their measurements) rather than approximate text vectors, **enabling explainable reasoning over technical document collections**.
28
+
29
+ The toolkit supports two extraction families: **local VLM** via Docling and **LLM-based extraction** via local (vLLM, Ollama) or API providers (Mistral, OpenAI, Gemini, IBM WatsonX), all orchestrated by a flexible, config-driven pipeline.
30
+
31
+
32
+
33
+ ## Key Capabilities
34
+
35
+ - **🧠 Extraction**:
36
+ - Local `VLM` (Docling's information extraction pipeline - ideal for small documents with key-value focus)
37
+ - `LLM` (local via vLLM/Ollama or remote via Mistral/OpenAI/Gemini/IBM WatsonX API)
38
+ - `Hybrid Chunking` Leveraging Docling's segmentation with semantic LLM chunking for more context-aware extraction
39
+ - `Page-wise` or `whole-document` conversion strategies for flexible processing
40
+ - **🔨 Graph Construction**:
41
+ - Markdown to Graph: Convert validated Pydantic instances to a `NetworkX DiGraph` with rich edge metadata and stable node IDs
42
+ - Smart Merge: Combine multi-page documents into a single Pydantic instance for unified processing
43
+ - Modular graph module with enhanced type safety and configuration
44
+ - **📦 Export**:
45
+ - `Docling Document` exports (JSON format with full document structure)
46
+ - `Markdown` exports (full document and per-page options)
47
+ - `CSV` compatible with `Neo4j` admin import
48
+ - `Cypher` script generation for bulk ingestion
49
+ - `JSON` export for general-purpose graph data
50
+ - **📊 Visualization**:
51
+ - Interactive `HTML` visualization in full-page browser view with enhanced node/edge exploration
52
+ - Detailed `MARKDOWN` report with graph nodes content and edges
53
+
54
+ ### Coming Soon
55
+
56
+ * 🪜 **Multi-Stage Extraction:** Define `extraction_stage` in templates to control multi-pass extraction.
57
+ * 🧩 **Interactive Template Builder:** Guided workflows for building Pydantic templates.
58
+ * 🧬 **Ontology-Based Templates:** Match content to the best Pydantic template using semantic similarity.
59
+ * ✍🏻 **Flexible Inputs:** Accepts `text`, `markdown`, and `DoclingDocument` directly.
60
+ * ⚡ **Batch Optimization:** Faster GPU inference with better memory handling.
61
+ * 💾 **Graph Database Integration:** Export data straight into `Neo4j`, `ArangoDB`, and similar databases.
62
+
63
+
64
+
65
+ ## Initial Setup
66
+
67
+ ### Requirements
68
+
69
+ - Python 3.10 or higher
70
+ - UV package manager
71
+
72
+ ### Installation
73
+
74
+ #### 1. Clone the Repository
75
+
76
+ ```bash
77
+ git clone https://github.com/IBM/docling-graph
78
+ cd docling-graph
79
+ ```
80
+
81
+ #### 2. Install Dependencies
82
+
83
+ Choose the installation option that matches your use case:
84
+
85
+ | Option | Command | Description |
86
+ | :--- | :--- | :--- |
87
+ | **Minimal** | `uv sync` | Includes core VLM features (Docling), **no** LLM inference |
88
+ | **Full** | `uv sync --extra all` | Includes **all** features, VLM, and all local/remote LLM providers |
89
+ | **Local LLM** | `uv sync --extra local` | Adds support for vLLM and Ollama (requires GPU for vLLM) |
90
+ | **Remote API** | `uv sync --extra remote` | Adds support for Mistral, OpenAI, Gemini, and IBM WatsonX APIs |
91
+ | **WatsonX** | `uv sync --extra watsonx` | Adds support for IBM WatsonX foundation models (Granite, Llama, Mixtral) |
92
+
93
+
94
+ #### 3. OPTIONAL - GPU Support (PyTorch)
95
+
96
+ Follow the steps in [this guide](docs/guides/setup_with_gpu_support.md) to install PyTorch with NVIDIA GPU (CUDA) support.
97
+
98
+
99
+
100
+ ### API Key Setup (for Remote Inference)
101
+
102
+ If you're using remote/cloud inference, set your API keys for the providers you plan to use:
103
+
104
+ ```bash
105
+ export OPENAI_API_KEY="..." # OpenAI
106
+ export MISTRAL_API_KEY="..." # Mistral
107
+ export GEMINI_API_KEY="..." # Google Gemini
108
+ export WATSONX_API_KEY="..." # IBM WatsonX
109
+ export WATSONX_PROJECT_ID="..." # IBM WatsonX Project ID
110
+ export WATSONX_URL="..." # IBM WatsonX URL (optional, defaults to US South)
111
+ ```
112
+
113
+ On Windows, replace `export` with `set` in Command Prompt or `$env:` in PowerShell.
114
+
115
+ Alternatively, add them to your `.env` file.
116
+
117
+ **Note:** For IBM WatsonX setup and available models, see the [WatsonX Integration Guide](docs/guides/watsonx_integration.md).
118
+
119
+
120
+
121
+ ## Getting Started
122
+
123
+ Docling Graph is primarily driven by its **CLI**, but you can easily integrate the core pipeline into Python scripts.
124
+
125
+ ### 1. Python Example
126
+
127
+ To run a conversion programmatically, you define a typed `PipelineConfig` and pass it to the `run_pipeline` function. This example uses a **remote LLM API** in a `many-to-one` mode for a single multi-page document:
128
+
129
+ ```python
130
+ from docling_graph import run_pipeline, PipelineConfig
131
+ from docs.examples.templates.rheology_research import Research # Pydantic model to use as an extraction template
132
+
133
+ # Create typed config
134
+ config = PipelineConfig(
135
+ source="docs/examples/data/research_paper/rheology.pdf",
136
+ template=Research,
137
+ backend="llm",
138
+ inference="remote",
139
+ processing_mode="many-to-one",
140
+ provider_override="mistral", # Specify your preferred provider and ensure its API key is set
141
+ model_override="mistral-medium-latest", # Specify your preferred LLM model
142
+ use_chunking=True, # Enable docling's hybrid chunker
143
+ llm_consolidation=False, # If False, programmatically merge batch-extracted dictionaries
144
+ output_dir="outputs/battery_research"
145
+ )
146
+
147
+ try:
148
+ run_pipeline(config)
149
+ print(f"\nExtraction complete! Graph data saved to: {config.output_dir}")
150
+ except Exception as e:
151
+ print(f"An error occurred: {e}")
152
+ ```
153
+
154
+
155
+ ### 2. CLI Example
156
+
157
+ Use the command-line interface for quick conversions and inspections. The following command runs the conversion using the local VLM backend and outputs a graph ready for Neo4j import:
158
+
159
+ #### 2.1. Initialize Configuration
160
+
161
+ A wizard will walk you through setting up the right config for your use case.
162
+
163
+ ```bash
164
+ uv run docling-graph init
165
+ ```
166
+
167
+ Note: This command may take a little longer to start on the first run, as it checks for installed dependencies.
168
+
169
+
170
+ #### 2.2. Run Conversion
171
+
172
+ You can use `docling-graph convert --help` to see the full list of available options and usage details.
173
+
174
+ ```bash
175
+ # uv run docling-graph convert <SOURCE_FILE_PATH> --template "<TEMPLATE_DOTTED_PATH>" [OPTIONS]
176
+
177
+ uv run docling-graph convert "docs/examples/data/research_paper/rheology.pdf" \
178
+ --template "docs.examples.templates.rheology_research.Research" \
179
+ --output-dir "outputs/battery_research" \
180
+ --processing-mode "many-to-one" \
181
+ --use-chunking \
182
+ --no-llm-consolidation
183
+ ```
184
+
185
+ #### 2.3. Inspect Results
186
+
187
+ ```bash
188
+ # uv run docling-graph inspect <CONVERT_OUTPUT_PATH> [OPTIONS]
189
+
190
+ uv run docling-graph inspect outputs/battery_research
191
+ ```
192
+
193
+
194
+
195
+ ## Pydantic Templates
196
+
197
+ Templates are the foundation of Docling Graph, defining both the **extraction schema** and the resulting **graph structure**.
198
+
199
+ * Use `is_entity=True` in `model_config` to explicitly mark a class as a graph node.
200
+ * Leverage `model_config.graph_id_fields` to create stable, readable node IDs (natural keys).
201
+ * Use the `Edge()` helper to define explicit relationships between entities.
202
+
203
+ **Example:**
204
+
205
+ ```python
206
+ from pydantic import BaseModel, Field
207
+ from typing import Optional
208
+
209
+ class Person(BaseModel):
210
+ """Person entity with stable ID based on name and DOB."""
211
+ model_config = {
212
+ 'is_entity': True,
213
+ 'graph_id_fields': ['last_name', 'date_of_birth']
214
+ }
215
+
216
+ first_name: str = Field(description="Person's first name")
217
+ last_name: str = Field(description="Person's last name")
218
+ date_of_birth: str = Field(description="Date of birth (YYYY-MM-DD)")
219
+ ```
220
+
221
+ Reference Pydantic [templates](docs/examples/templates) are available to help you get started quickly.
222
+
223
+ For complete guidance, see: [Pydantic Templates for Knowledge Graph Extraction](docs/guides/create_pydantic_templates_for_kg_extraction.md)
224
+
225
+
226
+
227
+ ## Documentation
228
+
229
+ * *Work In Progress...*
230
+
231
+
232
+
233
+ ## Examples
234
+
235
+ Get hands-on with Docling Graph [examples](docs/examples/scripts) to convert documents into knowledge graphs through `VLM` or `LLM`-based processing.
236
+
237
+ ## License
238
+
239
+ MIT License - see [LICENSE](LICENSE) for details.
240
+
241
+
242
+
243
+ ## Acknowledgments
244
+
245
+ - Powered by [Docling](https://github.com/docling-project/docling) for advanced document processing.
246
+ - Uses [Pydantic](https://pydantic.dev) for data validation.
247
+ - Graph generation powered by [NetworkX](https://networkx.org/).
248
+ - Visualizations powered by [Cytoscape.js](https://js.cytoscape.org/).
249
+ - CLI powered by [Typer](https://typer.tiangolo.com/) and [Rich](https://github.com/Textualize/rich).
250
+
251
+
252
+
253
+ ## IBM ❤️ Open Source AI
254
+
255
+ Docling Graph has been brought to you by IBM.
@@ -0,0 +1,14 @@
1
+ from .config import LLMConfig, ModelConfig, ModelsConfig, PipelineConfig, VLMConfig
2
+ from .pipeline import run_pipeline
3
+
4
+ __version__ = "0.2.4"
5
+
6
+ __all__ = [
7
+ "LLMConfig",
8
+ "ModelConfig",
9
+ "ModelsConfig",
10
+ "PipelineConfig",
11
+ "VLMConfig",
12
+ "__version__",
13
+ "run_pipeline",
14
+ ]
File without changes
@@ -0,0 +1,8 @@
1
+ """
2
+ CLI commands package.
3
+ """
4
+
5
+ from .convert import convert_command
6
+ from .init import init_command
7
+
8
+ __all__ = ["convert_command", "init_command"]