PyPI - docling-graph - Versions diffs - 0.2.4__py3-none-any.whl - Mend

docling-graph 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65)

docling_graph/__init__.py +14 -0
docling_graph/cli/__init__.py +0 -0
docling_graph/cli/commands/__init__.py +8 -0
docling_graph/cli/commands/convert.py +212 -0
docling_graph/cli/commands/init.py +85 -0
docling_graph/cli/commands/inspect.py +117 -0
docling_graph/cli/config_builder.py +330 -0
docling_graph/cli/config_utils.py +84 -0
docling_graph/cli/constants.py +49 -0
docling_graph/cli/dependencies.py +213 -0
docling_graph/cli/main.py +37 -0
docling_graph/cli/validators.py +226 -0
docling_graph/config.py +191 -0
docling_graph/core/__init__.py +34 -0
docling_graph/core/converters/__init__.py +0 -0
docling_graph/core/converters/config.py +48 -0
docling_graph/core/converters/graph_converter.py +278 -0
docling_graph/core/converters/models.py +48 -0
docling_graph/core/converters/node_id_registry.py +142 -0
docling_graph/core/exporters/__init__.py +8 -0
docling_graph/core/exporters/base.py +31 -0
docling_graph/core/exporters/csv_exporter.py +88 -0
docling_graph/core/exporters/cypher_exporter.py +170 -0
docling_graph/core/exporters/docling_exporter.py +101 -0
docling_graph/core/exporters/json_exporter.py +87 -0
docling_graph/core/extractors/__init__.py +4 -0
docling_graph/core/extractors/backends/__init__.py +0 -0
docling_graph/core/extractors/backends/llm_backend.py +182 -0
docling_graph/core/extractors/backends/vlm_backend.py +150 -0
docling_graph/core/extractors/chunk_batcher.py +256 -0
docling_graph/core/extractors/document_chunker.py +250 -0
docling_graph/core/extractors/document_processor.py +276 -0
docling_graph/core/extractors/extractor_base.py +27 -0
docling_graph/core/extractors/factory.py +96 -0
docling_graph/core/extractors/strategies/__init__.py +0 -0
docling_graph/core/extractors/strategies/many_to_one.py +450 -0
docling_graph/core/extractors/strategies/one_to_one.py +114 -0
docling_graph/core/utils/__init__.py +0 -0
docling_graph/core/utils/dict_merger.py +144 -0
docling_graph/core/utils/graph_cleaner.py +298 -0
docling_graph/core/utils/stats_calculator.py +71 -0
docling_graph/core/utils/string_formatter.py +94 -0
docling_graph/core/visualizers/__init__.py +6 -0
docling_graph/core/visualizers/base.py +32 -0
docling_graph/core/visualizers/interactive_visualizer.py +324 -0
docling_graph/core/visualizers/report_generator.py +153 -0
docling_graph/db_clients/__init__.py +0 -0
docling_graph/llm_clients/__init__.py +128 -0
docling_graph/llm_clients/base.py +38 -0
docling_graph/llm_clients/config.py +547 -0
docling_graph/llm_clients/gemini.py +125 -0
docling_graph/llm_clients/mistral.py +141 -0
docling_graph/llm_clients/ollama.py +122 -0
docling_graph/llm_clients/openai.py +124 -0
docling_graph/llm_clients/prompts.py +155 -0
docling_graph/llm_clients/vllm.py +142 -0
docling_graph/llm_clients/watsonx.py +214 -0
docling_graph/pipeline.py +283 -0
docling_graph/protocols.py +260 -0
docling_graph-0.2.4.dist-info/METADATA +310 -0
docling_graph-0.2.4.dist-info/RECORD +65 -0
docling_graph-0.2.4.dist-info/WHEEL +5 -0
docling_graph-0.2.4.dist-info/entry_points.txt +2 -0
docling_graph-0.2.4.dist-info/licenses/LICENSE +21 -0
docling_graph-0.2.4.dist-info/top_level.txt +1 -0

docling_graph/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from .config import LLMConfig, ModelConfig, ModelsConfig, PipelineConfig, VLMConfig
+from .pipeline import run_pipeline
+# Version string of the package; it is also listed in __all__ below.
+__version__ = "0.2.4"
+# Public names of the package: the configuration classes imported from
+# .config, the run_pipeline entry point imported from .pipeline, and the
+# version string.
+__all__ = [
+    "LLMConfig",
+    "ModelConfig",
+    "ModelsConfig",
+    "PipelineConfig",
+    "VLMConfig",
+    "__version__",
+    "run_pipeline",
+]

docling_graph/cli/__init__.py ADDED Viewed

File without changes

docling_graph/cli/commands/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""
+CLI commands package.
+"""
+from .convert import convert_command
+from .init import init_command
+# Names re-exported by this package.
+# NOTE(review): only the convert and init commands are listed here; confirm
+# whether other command modules are meant to be imported directly instead.
+__all__ = ["convert_command", "init_command"]

docling_graph/cli/commands/convert.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""
+Convert command - converts documents to knowledge graphs.
+"""
+import sys
+from pathlib import Path
+from typing import Optional
+import typer
+from rich import print as rich_print
+from typing_extensions import Annotated
+sys.path.append(str(Path.cwd()))
+from docling_graph.config import PipelineConfig
+from docling_graph.pipeline import run_pipeline
+from ..config_utils import load_config
+from ..validators import (
+    validate_backend_type,
+    validate_docling_config,
+    validate_export_format,
+    validate_inference,
+    validate_processing_mode,
+    validate_vlm_constraints,
+)
+def convert_command(
+    source: Annotated[
+        Path,
+        typer.Argument(
+            help="Path to the source document (PDF, JPG, PNG).",
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+        ),
+    ],
+    template: Annotated[
+        str,
+        typer.Option(
+            "--template",
+            "-t",
+            help="Dotted path to Pydantic template (e.g., 'templates.invoice.Invoice').",
+        ),
+    ],
+    processing_mode: Annotated[
+        str | None,
+        typer.Option(
+            "--processing-mode", "-p", help="Processing strategy: 'one-to-one' or 'many-to-one'."
+        ),
+    ] = None,
+    backend: Annotated[
+        str | None, typer.Option("--backend", "-b", help="Backend: 'llm' or 'vlm'.")
+    ] = None,
+    inference: Annotated[
+        str | None, typer.Option("--inference", "-i", help="Inference: 'local' or 'remote'.")
+    ] = None,
+    docling_pipeline: Annotated[
+        str | None,
+        typer.Option("--docling-pipeline", "-d", help="Docling pipeline: 'ocr' or 'vision'."),
+    ] = None,
+    # Extraction options
+    llm_consolidation: Annotated[
+        bool | None,
+        typer.Option(
+            "--llm-consolidation/--no-llm-consolidation",
+            help="Enable/disable final LLM consolidation step.",
+        ),
+    ] = None,
+    use_chunking: Annotated[
+        bool | None,
+        typer.Option(
+            "--use-chunking/--no-use-chunking",
+            help="Enable/disable document chunking.",
+        ),
+    ] = None,
+    # Docling export options
+    export_docling_json: Annotated[
+        bool,
+        typer.Option(
+            "--export-docling-json/--no-docling-json", help="Export Docling document as JSON."
+        ),
+    ] = True,
+    export_markdown: Annotated[
+        bool, typer.Option("--export-markdown/--no-markdown", help="Export full document markdown.")
+    ] = True,
+    export_per_page: Annotated[
+        bool,
+        typer.Option("--export-per-page/--no-per-page", help="Export per-page markdown files."),
+    ] = False,
+    # Output options
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            "--output-dir", "-o", help="Output directory.", file_okay=False, writable=True
+        ),
+    ] = Path("outputs"),
+    model: Annotated[str | None, typer.Option("--model", "-m", help="Override model name.")] = None,
+    provider: Annotated[str | None, typer.Option("--provider", help="Override provider.")] = None,
+    export_format: Annotated[
+        str | None,
+        typer.Option("--export-format", "-e", help="Export format: 'csv' or 'cypher'."),
+    ] = None,
+    reverse_edges: Annotated[
+        bool, typer.Option("--reverse-edges", "-r", help="Create bidirectional edges.")
+    ] = False,
+) -> None:
+    """Convert a document to a knowledge graph."""
+    rich_print("--- [blue]Docling-Graph Conversion[/blue] ---")
+    # Load YAML configuration (flat)
+    config_data = load_config()
+    defaults = config_data.get("defaults", {})
+    docling_cfg = config_data.get("docling", {})
+    models_from_yaml = config_data.get("models", {})  # flat models only
+    # Resolve configuration (CLI args override config file)
+    processing_mode_val = processing_mode or defaults.get("processing_mode", "many-to-one")
+    backend_val = backend or defaults.get("backend", "llm")
+    inference_val = inference or defaults.get("inference", "local")
+    export_format_val = export_format or defaults.get("export_format", "csv")
+    # Docling settings
+    docling_pipeline_val = docling_pipeline or docling_cfg.get("pipeline", "ocr")
+    # Resolve extraction settings
+    final_llm_consolidation = (
+        llm_consolidation
+        if llm_consolidation is not None
+        else defaults.get("llm_consolidation", True)
+    )
+    final_use_chunking = (
+        use_chunking if use_chunking is not None else defaults.get("use_chunking", True)
+    )
+    # Docling export settings - use config file as fallback
+    docling_export_settings = docling_cfg.get("export", {})
+    final_export_docling_json = (
+        export_docling_json
+        if export_docling_json is not None
+        else docling_export_settings.get("docling_json", True)
+    )
+    final_export_markdown = (
+        export_markdown
+        if export_markdown is not None
+        else docling_export_settings.get("markdown", True)
+    )
+    final_export_per_page = (
+        export_per_page
+        if export_per_page is not None
+        else docling_export_settings.get("per_page_markdown", False)
+    )
+    # Validate all inputs
+    processing_mode_val = validate_processing_mode(processing_mode_val)
+    backend_val = validate_backend_type(backend_val)
+    inference_val = validate_inference(inference_val)
+    docling_pipeline_val = validate_docling_config(docling_pipeline_val)
+    export_format_val = validate_export_format(export_format_val)
+    validate_vlm_constraints(backend_val, inference_val)
+    # Display configuration
+    rich_print("\n[bold]Configuration:[/bold]")
+    rich_print(f" • Source: [cyan]{source}[/cyan]")
+    rich_print(f" • Template: [cyan]{template}[/cyan]")
+    rich_print(f" • Docling Pipeline: [cyan]{docling_pipeline_val}[/cyan]")
+    rich_print(f" • Processing: [cyan]{processing_mode_val}[/cyan]")
+    rich_print(f" • Backend: [cyan]{backend_val}[/cyan]")
+    rich_print(f" • Inference: [cyan]{inference_val}[/cyan]")
+    rich_print(f" • Export: [cyan]{export_format_val}[/cyan]")
+    rich_print(f" • Reverse edges: [cyan]{reverse_edges}[/cyan]")
+    # Display Extraction settings
+    rich_print("\n[bold]Extraction Settings:[/bold]")
+    rich_print(f" • LLM Consolidation: [cyan]{final_llm_consolidation}[/cyan]")
+    rich_print(f" • Use Chunking: [cyan]{final_use_chunking}[/cyan]")
+    # Display Docling export settings
+    rich_print("\n[bold]Docling Export:[/bold]")
+    rich_print(f" • Document JSON: [cyan]{final_export_docling_json}[/cyan]")
+    rich_print(f" • Markdown: [cyan]{final_export_markdown}[/cyan]")
+    rich_print(f" • Per-page MD: [cyan]{final_export_per_page}[/cyan]")
+    # Build typed config
+    cfg = PipelineConfig(
+        source=str(source),
+        template=template,
+        backend=backend_val,
+        inference=inference_val,
+        processing_mode=processing_mode_val,
+        docling_config=docling_pipeline_val,
+        model_override=model,
+        provider_override=provider,
+        models=models_from_yaml,
+        llm_consolidation=final_llm_consolidation,
+        use_chunking=final_use_chunking,
+        export_format=export_format_val,
+        export_docling=True,
+        export_docling_json=final_export_docling_json,
+        export_markdown=final_export_markdown,
+        export_per_page_markdown=final_export_per_page,
+        reverse_edges=reverse_edges,
+        output_dir=str(output_dir),
+    )
+    # Run pipeline with normalized/validated config
+    try:
+        run_pipeline(cfg)
+    except Exception as e:
+        raise ValueError(str(e)) from e

docling_graph/cli/commands/init.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""
+Init command - creates configuration file interactively.
+"""
+from pathlib import Path
+import typer
+from rich import print as rich_print
+from docling_graph.config import PipelineConfig
+from ..config_builder import build_config_interactive, print_next_steps
+from ..config_utils import save_config
+from ..constants import CONFIG_FILE_NAME
+from ..validators import (
+    print_next_steps_with_deps,
+    validate_and_warn_dependencies,
+)
+def init_command() -> None:
+    """Create a customized configuration file through interactive prompts."""
+    output_path = Path.cwd() / CONFIG_FILE_NAME
+    # Check if config already exists
+    if output_path.exists():
+        rich_print(f"[yellow]A configuration file: '{CONFIG_FILE_NAME}' already exists.[/yellow]")
+        if not typer.confirm("Overwrite it?"):
+            rich_print("Initialization cancelled.")
+            return
+    # Build configuration
+    config_dict = _build_config_safe()
+    if config_dict is None:
+        raise typer.Exit(code=1)
+    # Validate dependencies
+    rich_print("\n[bold cyan]Validating dependencies...[/bold cyan]")
+    deps_valid = validate_and_warn_dependencies(config_dict)
+    # Save configuration
+    if not _save_config_safe(config_dict, output_path):
+        raise typer.Exit(code=1)
+    # Print next steps (consolidated logic handles dependency installation)
+    _print_final_steps(config_dict, deps_valid)
+def _build_config_safe() -> dict | None:
+    """Safely build configuration with fallback to defaults."""
+    try:
+        return build_config_interactive()
+    except (EOFError, KeyboardInterrupt, typer.Abort):
+        rich_print("[yellow]Interactive mode not available. Using default configuration.[/yellow]")
+        config = PipelineConfig.generate_yaml_dict()
+        rich_print("[blue]Loaded default configuration.[/blue]")
+        return config
+    except Exception as err:
+        rich_print(f"[red]Error creating config: {err}[/red]")
+        return None
+def _save_config_safe(config_dict: dict, output_path: Path) -> bool:
+    """Safely save configuration file."""
+    try:
+        save_config(config_dict, output_path)
+        rich_print(f"[green]Config successfully initiated at: {output_path}[/green]")
+        return True
+    except Exception as err:
+        rich_print(f"[red]Error saving config: {err}[/red]")
+        return False
+def _print_final_steps(config_dict: dict, deps_valid: bool) -> None:
+    """Print final next steps, handling dependency installation if needed."""
+    next_steps = print_next_steps(config_dict, return_text=True)
+    if deps_valid:
+        # Dependencies are already installed, just print steps
+        rich_print(next_steps)
+    else:
+        if next_steps is None:
+            next_steps = ""
+        # Dependencies missing, use the function that prepends install step
+        print_next_steps_with_deps(config_dict, next_steps)

docling_graph/cli/commands/inspect.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""
+Inspect command - visualizes graph data in browser.
+"""
+from pathlib import Path
+from typing import Optional
+import typer
+from rich import print as rich_print
+from typing_extensions import Annotated
+from ...core.visualizers.interactive_visualizer import InteractiveVisualizer
+def inspect_command(
+    path: Annotated[
+        Path,
+        typer.Argument(
+            help="Path to graph data. For CSV: directory with nodes.csv and edges.csv. For JSON: path to .json file.",
+            exists=True,
+        ),
+    ],
+    input_format: Annotated[
+        str, typer.Option("--format", "-f", help="Import format: 'csv' or 'json'.")
+    ] = "csv",
+    output: Annotated[
+        Path | None,
+        typer.Option(
+            "--output", "-o", help="Output HTML file path. If not specified, uses temporary file."
+        ),
+    ] = None,
+    open_browser: Annotated[
+        bool, typer.Option("--open/--no-open", help="Automatically open browser.")
+    ] = True,
+) -> None:
+    """
+    Visualize graph data in the browser.
+    This command creates an interactive HTML visualization that opens
+    in your default web browser. The HTML file is self-contained and
+    can be shared or saved for later viewing.
+    Examples:
+        # Visualize CSV format (default) - opens in browser
+        docling-graph inspect ./output_dir
+        # Visualize JSON format
+        docling-graph inspect graph.json --format json
+        # Save to specific location
+        docling-graph inspect ./output_dir --output graph_viz.html
+        # Create HTML without opening browser
+        docling-graph inspect ./output_dir --no-open --output viz.html
+    """
+    # Validate format
+    input_format = input_format.lower()
+    if input_format not in ["csv", "json"]:
+        rich_print(
+            f"[bold red]Error:[/bold red] Format must be 'csv' or 'json', got '{input_format}'"
+        )
+        raise typer.Exit(code=1)
+    # Validate path based on format
+    if input_format == "csv":
+        if not path.is_dir():
+            rich_print(
+                "[bold red]Error:[/bold red] For CSV format, path must be a directory containing nodes.csv and edges.csv"
+            )
+            raise typer.Exit(code=1)
+        nodes_path = path / "nodes.csv"
+        edges_path = path / "edges.csv"
+        if not nodes_path.exists():
+            rich_print(f"[bold red]Error:[/bold red] nodes.csv not found in {path}")
+            raise typer.Exit(code=1)
+        if not edges_path.exists():
+            rich_print(f"[bold red]Error:[/bold red] edges.csv not found in {path}")
+            raise typer.Exit(code=1)
+    elif input_format == "json":
+        if not path.is_file() or path.suffix != ".json":
+            rich_print("[bold red]Error:[/bold red] For JSON format, path must be a .json file")
+            raise typer.Exit(code=1)
+    rich_print("--- [blue]Starting Docling-Graph Inspection[/blue] ---")
+    rich_print("\n[bold]Interactive Visualization[/bold]")
+    rich_print(f"  Input: [cyan]{path}[/cyan]")
+    rich_print(f"  Format: [cyan]{input_format}[/cyan]")
+    if output:
+        rich_print(f"  Output: [cyan]{output}[/cyan]")
+    else:
+        rich_print("  Output: [cyan]temporary file[/cyan]")
+    try:
+        # Create visualizer
+        visualizer = InteractiveVisualizer()
+        # Load and visualize
+        rich_print("\nLoading graph data...")
+        visualizer.display_cytoscape_graph(
+            path=path, input_format=input_format, output_path=output, open_browser=open_browser
+        )
+        rich_print("--- [blue]Docling-Graph Inspection Finished Successfully[/blue] ---")
+        if not open_browser:
+            rich_print(
+                "\n[blue]Tip:[/blue] Open the HTML file in your browser to view the visualization"
+            )
+    except Exception as e:
+        rich_print(f"[bold red]Error:[/bold red] {type(e).__name__}: {e}")
+        return None