docling-graph 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. docling_graph/__init__.py +14 -0
  2. docling_graph/cli/__init__.py +0 -0
  3. docling_graph/cli/commands/__init__.py +8 -0
  4. docling_graph/cli/commands/convert.py +212 -0
  5. docling_graph/cli/commands/init.py +85 -0
  6. docling_graph/cli/commands/inspect.py +117 -0
  7. docling_graph/cli/config_builder.py +330 -0
  8. docling_graph/cli/config_utils.py +84 -0
  9. docling_graph/cli/constants.py +49 -0
  10. docling_graph/cli/dependencies.py +213 -0
  11. docling_graph/cli/main.py +37 -0
  12. docling_graph/cli/validators.py +226 -0
  13. docling_graph/config.py +191 -0
  14. docling_graph/core/__init__.py +34 -0
  15. docling_graph/core/converters/__init__.py +0 -0
  16. docling_graph/core/converters/config.py +48 -0
  17. docling_graph/core/converters/graph_converter.py +278 -0
  18. docling_graph/core/converters/models.py +48 -0
  19. docling_graph/core/converters/node_id_registry.py +142 -0
  20. docling_graph/core/exporters/__init__.py +8 -0
  21. docling_graph/core/exporters/base.py +31 -0
  22. docling_graph/core/exporters/csv_exporter.py +88 -0
  23. docling_graph/core/exporters/cypher_exporter.py +170 -0
  24. docling_graph/core/exporters/docling_exporter.py +101 -0
  25. docling_graph/core/exporters/json_exporter.py +87 -0
  26. docling_graph/core/extractors/__init__.py +4 -0
  27. docling_graph/core/extractors/backends/__init__.py +0 -0
  28. docling_graph/core/extractors/backends/llm_backend.py +182 -0
  29. docling_graph/core/extractors/backends/vlm_backend.py +150 -0
  30. docling_graph/core/extractors/chunk_batcher.py +256 -0
  31. docling_graph/core/extractors/document_chunker.py +250 -0
  32. docling_graph/core/extractors/document_processor.py +276 -0
  33. docling_graph/core/extractors/extractor_base.py +27 -0
  34. docling_graph/core/extractors/factory.py +96 -0
  35. docling_graph/core/extractors/strategies/__init__.py +0 -0
  36. docling_graph/core/extractors/strategies/many_to_one.py +450 -0
  37. docling_graph/core/extractors/strategies/one_to_one.py +114 -0
  38. docling_graph/core/utils/__init__.py +0 -0
  39. docling_graph/core/utils/dict_merger.py +144 -0
  40. docling_graph/core/utils/graph_cleaner.py +298 -0
  41. docling_graph/core/utils/stats_calculator.py +71 -0
  42. docling_graph/core/utils/string_formatter.py +94 -0
  43. docling_graph/core/visualizers/__init__.py +6 -0
  44. docling_graph/core/visualizers/base.py +32 -0
  45. docling_graph/core/visualizers/interactive_visualizer.py +324 -0
  46. docling_graph/core/visualizers/report_generator.py +153 -0
  47. docling_graph/db_clients/__init__.py +0 -0
  48. docling_graph/llm_clients/__init__.py +128 -0
  49. docling_graph/llm_clients/base.py +38 -0
  50. docling_graph/llm_clients/config.py +547 -0
  51. docling_graph/llm_clients/gemini.py +125 -0
  52. docling_graph/llm_clients/mistral.py +141 -0
  53. docling_graph/llm_clients/ollama.py +122 -0
  54. docling_graph/llm_clients/openai.py +124 -0
  55. docling_graph/llm_clients/prompts.py +155 -0
  56. docling_graph/llm_clients/vllm.py +142 -0
  57. docling_graph/llm_clients/watsonx.py +214 -0
  58. docling_graph/pipeline.py +283 -0
  59. docling_graph/protocols.py +260 -0
  60. docling_graph-0.2.4.dist-info/METADATA +310 -0
  61. docling_graph-0.2.4.dist-info/RECORD +65 -0
  62. docling_graph-0.2.4.dist-info/WHEEL +5 -0
  63. docling_graph-0.2.4.dist-info/entry_points.txt +2 -0
  64. docling_graph-0.2.4.dist-info/licenses/LICENSE +21 -0
  65. docling_graph-0.2.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,14 @@
1
+ from .config import LLMConfig, ModelConfig, ModelsConfig, PipelineConfig, VLMConfig
2
+ from .pipeline import run_pipeline
3
+
4
+ __version__ = "0.2.4"
5
+
6
+ __all__ = [
7
+ "LLMConfig",
8
+ "ModelConfig",
9
+ "ModelsConfig",
10
+ "PipelineConfig",
11
+ "VLMConfig",
12
+ "__version__",
13
+ "run_pipeline",
14
+ ]
File without changes
@@ -0,0 +1,8 @@
1
+ """
2
+ CLI commands package.
3
+ """
4
+
5
+ from .convert import convert_command
6
+ from .init import init_command
7
+
8
+ __all__ = ["convert_command", "init_command"]
@@ -0,0 +1,212 @@
1
+ """
2
+ Convert command - converts documents to knowledge graphs.
3
+ """
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import typer
10
+ from rich import print as rich_print
11
+ from typing_extensions import Annotated
12
+
13
+ sys.path.append(str(Path.cwd()))
14
+
15
+ from docling_graph.config import PipelineConfig
16
+ from docling_graph.pipeline import run_pipeline
17
+
18
+ from ..config_utils import load_config
19
+ from ..validators import (
20
+ validate_backend_type,
21
+ validate_docling_config,
22
+ validate_export_format,
23
+ validate_inference,
24
+ validate_processing_mode,
25
+ validate_vlm_constraints,
26
+ )
27
+
28
+
29
def _cli_or_config(cli_value: bool | None, section: dict, key: str, default: bool) -> bool:
    """Return ``cli_value`` when the flag was passed, else ``section[key]`` (or ``default``)."""
    if cli_value is not None:
        return cli_value
    return section.get(key, default)


def convert_command(
    source: Annotated[
        Path,
        typer.Argument(
            help="Path to the source document (PDF, JPG, PNG).",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    template: Annotated[
        str,
        typer.Option(
            "--template",
            "-t",
            help="Dotted path to Pydantic template (e.g., 'templates.invoice.Invoice').",
        ),
    ],
    processing_mode: Annotated[
        str | None,
        typer.Option(
            "--processing-mode", "-p", help="Processing strategy: 'one-to-one' or 'many-to-one'."
        ),
    ] = None,
    backend: Annotated[
        str | None, typer.Option("--backend", "-b", help="Backend: 'llm' or 'vlm'.")
    ] = None,
    inference: Annotated[
        str | None, typer.Option("--inference", "-i", help="Inference: 'local' or 'remote'.")
    ] = None,
    docling_pipeline: Annotated[
        str | None,
        typer.Option("--docling-pipeline", "-d", help="Docling pipeline: 'ocr' or 'vision'."),
    ] = None,
    # Extraction options
    llm_consolidation: Annotated[
        bool | None,
        typer.Option(
            "--llm-consolidation/--no-llm-consolidation",
            help="Enable/disable final LLM consolidation step.",
        ),
    ] = None,
    use_chunking: Annotated[
        bool | None,
        typer.Option(
            "--use-chunking/--no-use-chunking",
            help="Enable/disable document chunking.",
        ),
    ] = None,
    # Docling export options.
    # These default to None ("flag not given") so that the values from the
    # config file's docling.export section can actually take effect; with a
    # concrete bool default the CLI value would always win.
    export_docling_json: Annotated[
        bool | None,
        typer.Option(
            "--export-docling-json/--no-docling-json", help="Export Docling document as JSON."
        ),
    ] = None,
    export_markdown: Annotated[
        bool | None,
        typer.Option("--export-markdown/--no-markdown", help="Export full document markdown."),
    ] = None,
    export_per_page: Annotated[
        bool | None,
        typer.Option("--export-per-page/--no-per-page", help="Export per-page markdown files."),
    ] = None,
    # Output options
    output_dir: Annotated[
        Path,
        typer.Option(
            "--output-dir", "-o", help="Output directory.", file_okay=False, writable=True
        ),
    ] = Path("outputs"),
    model: Annotated[str | None, typer.Option("--model", "-m", help="Override model name.")] = None,
    provider: Annotated[str | None, typer.Option("--provider", help="Override provider.")] = None,
    export_format: Annotated[
        str | None,
        typer.Option("--export-format", "-e", help="Export format: 'csv' or 'cypher'."),
    ] = None,
    reverse_edges: Annotated[
        bool, typer.Option("--reverse-edges", "-r", help="Create bidirectional edges.")
    ] = False,
) -> None:
    """Convert a document to a knowledge graph."""
    # NOTE: the docstring above is kept short on purpose; Typer conventionally
    # shows a command's docstring as its --help text.
    #
    # Flow: load the config file, resolve every setting (an explicit CLI value
    # beats the config file, which beats the built-in default), validate the
    # resolved values, echo them, build a PipelineConfig and run the pipeline.
    #
    # Raises:
    #     ValueError: wraps any exception raised by run_pipeline (the original
    #         exception is chained as __cause__).
    rich_print("--- [blue]Docling-Graph Conversion[/blue] ---")

    # Load YAML configuration (flat).
    # `or {}` also covers a section key that is present but empty (null).
    config_data = load_config()
    defaults = config_data.get("defaults") or {}
    docling_cfg = config_data.get("docling") or {}
    models_from_yaml = config_data.get("models", {})  # flat models only

    # Resolve configuration (CLI args override config file)
    processing_mode_val = processing_mode or defaults.get("processing_mode", "many-to-one")
    backend_val = backend or defaults.get("backend", "llm")
    inference_val = inference or defaults.get("inference", "local")
    export_format_val = export_format or defaults.get("export_format", "csv")

    # Docling settings
    docling_pipeline_val = docling_pipeline or docling_cfg.get("pipeline", "ocr")

    # Resolve extraction settings
    final_llm_consolidation = _cli_or_config(
        llm_consolidation, defaults, "llm_consolidation", True
    )
    final_use_chunking = _cli_or_config(use_chunking, defaults, "use_chunking", True)

    # Docling export settings - use config file as fallback
    docling_export_settings = docling_cfg.get("export") or {}
    final_export_docling_json = _cli_or_config(
        export_docling_json, docling_export_settings, "docling_json", True
    )
    final_export_markdown = _cli_or_config(
        export_markdown, docling_export_settings, "markdown", True
    )
    final_export_per_page = _cli_or_config(
        export_per_page, docling_export_settings, "per_page_markdown", False
    )

    # Validate all inputs; the validators return the value to use from here on.
    processing_mode_val = validate_processing_mode(processing_mode_val)
    backend_val = validate_backend_type(backend_val)
    inference_val = validate_inference(inference_val)
    docling_pipeline_val = validate_docling_config(docling_pipeline_val)
    export_format_val = validate_export_format(export_format_val)
    validate_vlm_constraints(backend_val, inference_val)

    # Display configuration
    rich_print("\n[bold]Configuration:[/bold]")
    rich_print(f" • Source: [cyan]{source}[/cyan]")
    rich_print(f" • Template: [cyan]{template}[/cyan]")
    rich_print(f" • Docling Pipeline: [cyan]{docling_pipeline_val}[/cyan]")
    rich_print(f" • Processing: [cyan]{processing_mode_val}[/cyan]")
    rich_print(f" • Backend: [cyan]{backend_val}[/cyan]")
    rich_print(f" • Inference: [cyan]{inference_val}[/cyan]")
    rich_print(f" • Export: [cyan]{export_format_val}[/cyan]")
    rich_print(f" • Reverse edges: [cyan]{reverse_edges}[/cyan]")

    # Display Extraction settings
    rich_print("\n[bold]Extraction Settings:[/bold]")
    rich_print(f" • LLM Consolidation: [cyan]{final_llm_consolidation}[/cyan]")
    rich_print(f" • Use Chunking: [cyan]{final_use_chunking}[/cyan]")

    # Display Docling export settings
    rich_print("\n[bold]Docling Export:[/bold]")
    rich_print(f" • Document JSON: [cyan]{final_export_docling_json}[/cyan]")
    rich_print(f" • Markdown: [cyan]{final_export_markdown}[/cyan]")
    rich_print(f" • Per-page MD: [cyan]{final_export_per_page}[/cyan]")

    # Build typed config
    cfg = PipelineConfig(
        source=str(source),
        template=template,
        backend=backend_val,
        inference=inference_val,
        processing_mode=processing_mode_val,
        docling_config=docling_pipeline_val,
        model_override=model,
        provider_override=provider,
        models=models_from_yaml,
        llm_consolidation=final_llm_consolidation,
        use_chunking=final_use_chunking,
        export_format=export_format_val,
        export_docling=True,
        export_docling_json=final_export_docling_json,
        export_markdown=final_export_markdown,
        export_per_page_markdown=final_export_per_page,
        reverse_edges=reverse_edges,
        output_dir=str(output_dir),
    )

    # Run pipeline with normalized/validated config
    try:
        run_pipeline(cfg)
    except Exception as e:
        raise ValueError(str(e)) from e
@@ -0,0 +1,85 @@
1
+ """
2
+ Init command - creates configuration file interactively.
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ import typer
8
+ from rich import print as rich_print
9
+
10
+ from docling_graph.config import PipelineConfig
11
+
12
+ from ..config_builder import build_config_interactive, print_next_steps
13
+ from ..config_utils import save_config
14
+ from ..constants import CONFIG_FILE_NAME
15
+ from ..validators import (
16
+ print_next_steps_with_deps,
17
+ validate_and_warn_dependencies,
18
+ )
19
+
20
+
21
def init_command() -> None:
    """Create a customized configuration file through interactive prompts."""
    # Steps: confirm overwrite if the file exists, build the config, check
    # dependencies, write the file, then print the follow-up instructions.
    # Exits with status 1 when building or saving the config fails.
    target = Path.cwd() / CONFIG_FILE_NAME

    if target.exists():
        rich_print(f"[yellow]A configuration file: '{CONFIG_FILE_NAME}' already exists.[/yellow]")
        overwrite = typer.confirm("Overwrite it?")
        if not overwrite:
            rich_print("Initialization cancelled.")
            return

    settings = _build_config_safe()
    if settings is None:
        raise typer.Exit(code=1)

    rich_print("\n[bold cyan]Validating dependencies...[/bold cyan]")
    dependencies_ok = validate_and_warn_dependencies(settings)

    saved = _save_config_safe(settings, target)
    if not saved:
        raise typer.Exit(code=1)

    # The helper picks the right output depending on whether dependencies
    # were found to be valid.
    _print_final_steps(settings, dependencies_ok)
47
+
48
+
49
def _build_config_safe() -> dict | None:
    """Build the configuration interactively, degrading gracefully on failure.

    Returns:
        The interactively built config; the default config from
        ``PipelineConfig.generate_yaml_dict()`` when the prompt is cut short
        (EOF, keyboard interrupt or ``typer.Abort``); ``None`` on any other
        error, after printing it.
    """
    try:
        built = build_config_interactive()
    except (EOFError, KeyboardInterrupt, typer.Abort):
        rich_print("[yellow]Interactive mode not available. Using default configuration.[/yellow]")
        fallback = PipelineConfig.generate_yaml_dict()
        rich_print("[blue]Loaded default configuration.[/blue]")
        return fallback
    except Exception as problem:
        rich_print(f"[red]Error creating config: {problem}[/red]")
        return None
    return built
61
+
62
+
63
def _save_config_safe(config_dict: dict, output_path: Path) -> bool:
    """Write the configuration to ``output_path`` and report the outcome.

    Returns:
        ``True`` when saving (and the success message) completed, ``False``
        when an exception was raised; the error is printed, not re-raised.
    """
    succeeded = False
    try:
        save_config(config_dict, output_path)
        rich_print(f"[green]Config successfully initiated at: {output_path}[/green]")
        succeeded = True
    except Exception as failure:
        rich_print(f"[red]Error saving config: {failure}[/red]")
    return succeeded
72
+
73
+
74
def _print_final_steps(config_dict: dict, deps_valid: bool) -> None:
    """Print final next steps, handling dependency installation if needed.

    Args:
        config_dict: The configuration that was just created.
        deps_valid: Result of the dependency validation step.
    """
    next_steps = print_next_steps(config_dict, return_text=True)

    if deps_valid:
        # Dependencies are already installed, just print steps.
        # Skip printing when no text came back, so the literal "None" is
        # never shown to the user.
        if next_steps is not None:
            rich_print(next_steps)
        return

    # Dependencies missing, use the function that prepends install step
    print_next_steps_with_deps(config_dict, "" if next_steps is None else next_steps)
@@ -0,0 +1,117 @@
1
+ """
2
+ Inspect command - visualizes graph data in browser.
3
+ """
4
+
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import typer
9
+ from rich import print as rich_print
10
+ from typing_extensions import Annotated
11
+
12
+ from ...core.visualizers.interactive_visualizer import InteractiveVisualizer
13
+
14
+
15
def inspect_command(
    path: Annotated[
        Path,
        typer.Argument(
            help="Path to graph data. For CSV: directory with nodes.csv and edges.csv. For JSON: path to .json file.",
            exists=True,
        ),
    ],
    input_format: Annotated[
        str, typer.Option("--format", "-f", help="Import format: 'csv' or 'json'.")
    ] = "csv",
    output: Annotated[
        Path | None,
        typer.Option(
            "--output", "-o", help="Output HTML file path. If not specified, uses temporary file."
        ),
    ] = None,
    open_browser: Annotated[
        bool, typer.Option("--open/--no-open", help="Automatically open browser.")
    ] = True,
) -> None:
    """
    Visualize graph data in the browser.

    This command creates an interactive HTML visualization that opens
    in your default web browser. The HTML file is self-contained and
    can be shared or saved for later viewing.

    Examples:
        # Visualize CSV format (default) - opens in browser
        docling-graph inspect ./output_dir

        # Visualize JSON format
        docling-graph inspect graph.json --format json

        # Save to specific location
        docling-graph inspect ./output_dir --output graph_viz.html

        # Create HTML without opening browser
        docling-graph inspect ./output_dir --no-open --output viz.html
    """
    # Every failure path below (bad format, bad path, visualization error)
    # prints a message and terminates with exit status 1.

    # Validate format (case-insensitive)
    input_format = input_format.lower()
    if input_format not in ("csv", "json"):
        rich_print(
            f"[bold red]Error:[/bold red] Format must be 'csv' or 'json', got '{input_format}'"
        )
        raise typer.Exit(code=1)

    # Validate path based on format
    if input_format == "csv":
        if not path.is_dir():
            rich_print(
                "[bold red]Error:[/bold red] For CSV format, path must be a directory containing nodes.csv and edges.csv"
            )
            raise typer.Exit(code=1)

        nodes_path = path / "nodes.csv"
        edges_path = path / "edges.csv"

        if not nodes_path.exists():
            rich_print(f"[bold red]Error:[/bold red] nodes.csv not found in {path}")
            raise typer.Exit(code=1)

        if not edges_path.exists():
            rich_print(f"[bold red]Error:[/bold red] edges.csv not found in {path}")
            raise typer.Exit(code=1)

    elif input_format == "json":
        if not path.is_file() or path.suffix != ".json":
            rich_print("[bold red]Error:[/bold red] For JSON format, path must be a .json file")
            raise typer.Exit(code=1)

    rich_print("--- [blue]Starting Docling-Graph Inspection[/blue] ---")
    rich_print("\n[bold]Interactive Visualization[/bold]")
    rich_print(f" Input: [cyan]{path}[/cyan]")
    rich_print(f" Format: [cyan]{input_format}[/cyan]")
    if output:
        rich_print(f" Output: [cyan]{output}[/cyan]")
    else:
        rich_print(" Output: [cyan]temporary file[/cyan]")

    try:
        # Create visualizer
        visualizer = InteractiveVisualizer()

        # Load and visualize
        rich_print("\nLoading graph data...")
        visualizer.display_cytoscape_graph(
            path=path, input_format=input_format, output_path=output, open_browser=open_browser
        )

        rich_print("--- [blue]Docling-Graph Inspection Finished Successfully[/blue] ---")

        if not open_browser:
            rich_print(
                "\n[blue]Tip:[/blue] Open the HTML file in your browser to view the visualization"
            )

    except Exception as e:
        rich_print(f"[bold red]Error:[/bold red] {type(e).__name__}: {e}")
        # Report the failure through the exit status as well, like the
        # validation errors above, instead of returning as if it succeeded.
        raise typer.Exit(code=1) from e