graphrag-input 3.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
1
+ # Python Artifacts
2
+ python/*/lib/
3
+ dist/
4
+ build/
5
+ *.egg-info/
6
+
7
+ # Test Output
8
+ .coverage
9
+ coverage/
10
+ licenses.txt
11
+ examples_notebooks/*/data
12
+ tests/fixtures/cache
13
+ tests/fixtures/*/cache
14
+ tests/fixtures/*/output
15
+ output/lancedb
16
+
17
+
18
+ # Random
19
+ .DS_Store
20
+ *.log*
21
+ .venv
22
+ venv/
23
+ .conda
24
+ .tmp
25
+ packages/graphrag-llm/notebooks/metrics
26
+ packages/graphrag-llm/notebooks/cache
27
+
28
+ .env
29
+ build.zip
30
+
31
+ .turbo
32
+
33
+ __pycache__
34
+
35
+ .pipeline
36
+
37
+ # Azurite
38
+ temp_azurite/
39
+ __azurite*.json
40
+ __blobstorage*.json
41
+ __blobstorage__/
42
+
43
+ # Getting started example
44
+ ragtest/
45
+ .ragtest/
46
+ .pipelines
47
+ .pipeline
48
+
49
+
50
+ # mkdocs
51
+ site/
52
+
53
+ # Docs migration
54
+ docsite/
55
+ .yarn/
56
+ .pnp*
57
+
58
+ # PyCharm
59
+ .idea/
60
+
61
+ # Jupyter notebook
62
+ .ipynb_checkpoints/
63
+
64
+ # Root build assets
65
+ packages/*/LICENSE
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: graphrag-input
3
+ Version: 3.0.0
4
+ Summary: Input document loading utilities for GraphRAG
5
+ Project-URL: Source, https://github.com/microsoft/graphrag
6
+ Author: Mónica Carvajal
7
+ Author-email: Alonso Guevara Fernández <alonsog@microsoft.com>, Andrés Morales Esquivel <andresmor@microsoft.com>, Chris Trevino <chtrevin@microsoft.com>, David Tittsworth <datittsw@microsoft.com>, Dayenne de Souza <ddesouza@microsoft.com>, Derek Worthen <deworthe@microsoft.com>, Gaudy Blanco Meneses <gaudyb@microsoft.com>, Ha Trinh <trinhha@microsoft.com>, Jonathan Larson <jolarso@microsoft.com>, Josh Bradley <joshbradley@microsoft.com>, Kate Lytvynets <kalytv@microsoft.com>, Kenny Zhang <zhangken@microsoft.com>, Nathan Evans <naevans@microsoft.com>, Rodrigo Racanicci <rracanicci@microsoft.com>, Sarah Smith <smithsarah@microsoft.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Python: <3.14,>=3.11
15
+ Requires-Dist: graphrag-common==3.0.0
16
+ Requires-Dist: graphrag-storage==3.0.0
17
+ Requires-Dist: markitdown~=0.1.0
18
+ Requires-Dist: pydantic~=2.10
19
+ Description-Content-Type: text/markdown
20
+
21
+ # GraphRAG Inputs
22
+
23
+ This package provides input document loading utilities for GraphRAG, supporting multiple file formats including CSV, JSON, JSON Lines, and plain text.
24
+
25
+ ## Supported File Types
26
+
27
+ The following four standard file formats are supported out of the box:
28
+
29
+ - **CSV** - Tabular data with configurable column mappings
30
+ - **JSON** - JSON files with configurable property paths
31
+ - **JSON Lines** - Line-delimited JSON records
32
+ - **Text** - Plain text files
33
+
34
+ ### MarkItDown Support
35
+
36
+ Additionally, we support the `InputType.MarkItDown` format, which uses the [MarkItDown](https://github.com/microsoft/markitdown) library to import any supported file type. The MarkItDown converter can handle a wide variety of file formats including Office documents, PDFs, HTML, and more.
37
+
38
+ **Note:** Additional optional dependencies may need to be installed depending on the file type you're processing. The choice of converter is determined by MarkItDown's processing logic, which primarily uses the file extension to select the appropriate converter. Please refer to the [MarkItDown repository](https://github.com/microsoft/markitdown) for installation instructions and detailed information about supported formats.
39
+
40
+ ## Examples
41
+
42
+ Basic usage with the factory:
43
+ ```python
44
+ from graphrag_input import create_input_reader, InputConfig, InputType
45
+ from graphrag_storage import StorageConfig, create_storage
46
+
47
+ config = InputConfig(
48
+ type=InputType.Csv,
49
+ text_column="content",
50
+ title_column="title",
51
+ )
52
+ storage = create_storage(StorageConfig(base_dir="./input"))
53
+ reader = create_input_reader(config, storage)
54
+ documents = await reader.read_files()
55
+ ```
56
+
57
+ Import a PDF with MarkItDown:
58
+
59
+ ```bash
60
+ pip install 'markitdown[pdf]' # required dependency for pdf processing
61
+ ```
62
+
63
+ ```python
64
+ from graphrag_input import create_input_reader, InputConfig, InputType
65
+ from graphrag_storage import StorageConfig, create_storage
66
+
67
+ config = InputConfig(
68
+ type=InputType.MarkItDown,
69
+ file_pattern=".*\\.pdf$"
70
+ )
71
+ storage = create_storage(StorageConfig(base_dir="./input"))
72
+ reader = create_input_reader(config, storage)
73
+ documents = await reader.read_files()
74
+ ```
75
+
76
+ Equivalent YAML configuration for the example above:
77
+ ```yaml
78
+ input:
79
+ type: markitdown
80
+ file_pattern: ".*\\.pdf$$"
81
+ input_storage:
82
+ type: file
83
+ base_dir: "input"
84
+ ```
85
+
86
+ Note that when specifying column names for data extraction, we can handle nested objects (e.g., in JSON) with dot notation:
87
+ ```python
88
+ from graphrag_input import get_property
89
+
90
+ data = {"user": {"profile": {"name": "Alice"}}}
91
+ name = get_property(data, "user.profile.name") # Returns "Alice"
92
+ ```
@@ -0,0 +1,72 @@
1
+ # GraphRAG Inputs
2
+
3
+ This package provides input document loading utilities for GraphRAG, supporting multiple file formats including CSV, JSON, JSON Lines, and plain text.
4
+
5
+ ## Supported File Types
6
+
7
+ The following four standard file formats are supported out of the box:
8
+
9
+ - **CSV** - Tabular data with configurable column mappings
10
+ - **JSON** - JSON files with configurable property paths
11
+ - **JSON Lines** - Line-delimited JSON records
12
+ - **Text** - Plain text files
13
+
14
+ ### MarkItDown Support
15
+
16
+ Additionally, we support the `InputType.MarkItDown` format, which uses the [MarkItDown](https://github.com/microsoft/markitdown) library to import any supported file type. The MarkItDown converter can handle a wide variety of file formats including Office documents, PDFs, HTML, and more.
17
+
18
+ **Note:** Additional optional dependencies may need to be installed depending on the file type you're processing. The choice of converter is determined by MarkItDown's processing logic, which primarily uses the file extension to select the appropriate converter. Please refer to the [MarkItDown repository](https://github.com/microsoft/markitdown) for installation instructions and detailed information about supported formats.
19
+
20
+ ## Examples
21
+
22
+ Basic usage with the factory:
23
+ ```python
24
+ from graphrag_input import create_input_reader, InputConfig, InputType
25
+ from graphrag_storage import StorageConfig, create_storage
26
+
27
+ config = InputConfig(
28
+ type=InputType.Csv,
29
+ text_column="content",
30
+ title_column="title",
31
+ )
32
+ storage = create_storage(StorageConfig(base_dir="./input"))
33
+ reader = create_input_reader(config, storage)
34
+ documents = await reader.read_files()
35
+ ```
36
+
37
+ Import a PDF with MarkItDown:
38
+
39
+ ```bash
40
+ pip install 'markitdown[pdf]' # required dependency for pdf processing
41
+ ```
42
+
43
+ ```python
44
+ from graphrag_input import create_input_reader, InputConfig, InputType
45
+ from graphrag_storage import StorageConfig, create_storage
46
+
47
+ config = InputConfig(
48
+ type=InputType.MarkItDown,
49
+ file_pattern=".*\\.pdf$"
50
+ )
51
+ storage = create_storage(StorageConfig(base_dir="./input"))
52
+ reader = create_input_reader(config, storage)
53
+ documents = await reader.read_files()
54
+ ```
55
+
56
+ Equivalent YAML configuration for the example above:
57
+ ```yaml
58
+ input:
59
+ type: markitdown
60
+ file_pattern: ".*\\.pdf$$"
61
+ input_storage:
62
+ type: file
63
+ base_dir: "input"
64
+ ```
65
+
66
+ Note that when specifying column names for data extraction, we can handle nested objects (e.g., in JSON) with dot notation:
67
+ ```python
68
+ from graphrag_input import get_property
69
+
70
+ data = {"user": {"profile": {"name": "Alice"}}}
71
+ name = get_property(data, "user.profile.name") # Returns "Alice"
72
+ ```
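For illustration, a minimal sketch (not part of the packaged files) of using a dotted path directly as `text_column` when reading JSON input; nested column names are resolved through `get_property`. The field names `article.body` and `article.headline` are hypothetical, and the top-level `await` assumes the same notebook-style context as the examples above.

```python
from graphrag_input import InputConfig, InputType, create_input_reader
from graphrag_storage import StorageConfig, create_storage

# Hypothetical JSON records shaped like {"article": {"headline": "...", "body": "..."}}.
# Dotted column paths are resolved via get_property, so nested values can be
# used as the text and title of each document.
config = InputConfig(
    type=InputType.Json,
    text_column="article.body",
    title_column="article.headline",
)
storage = create_storage(StorageConfig(base_dir="./input"))
reader = create_input_reader(config, storage)
documents = await reader.read_files()
```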
@@ -0,0 +1,20 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """GraphRAG input document loading package."""
5
+
6
+ from graphrag_input.get_property import get_property
7
+ from graphrag_input.input_config import InputConfig
8
+ from graphrag_input.input_reader import InputReader
9
+ from graphrag_input.input_reader_factory import create_input_reader
10
+ from graphrag_input.input_type import InputType
11
+ from graphrag_input.text_document import TextDocument
12
+
13
+ __all__ = [
14
+ "InputConfig",
15
+ "InputReader",
16
+ "InputType",
17
+ "TextDocument",
18
+ "create_input_reader",
19
+ "get_property",
20
+ ]
@@ -0,0 +1,38 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'CSVFileReader' model."""
5
+
6
+ import csv
7
+ import logging
8
+
9
+ from graphrag_input.structured_file_reader import StructuredFileReader
10
+ from graphrag_input.text_document import TextDocument
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class CSVFileReader(StructuredFileReader):
16
+ """Reader implementation for csv files."""
17
+
18
+ def __init__(self, file_pattern: str | None = None, **kwargs):
19
+ super().__init__(
20
+ file_pattern=file_pattern if file_pattern is not None else ".*\\.csv$",
21
+ **kwargs,
22
+ )
23
+
24
+ async def read_file(self, path: str) -> list[TextDocument]:
25
+ """Read a csv file into a list of documents.
26
+
27
+ Args:
28
+ - path - The path to read the file from.
29
+
30
+ Returns
31
+ -------
32
+ - output - list with a TextDocument for each row in the file.
33
+ """
34
+ file = await self._storage.get(path, encoding=self._encoding)
35
+
36
+ reader = csv.DictReader(file.splitlines())
37
+ rows = list(reader)
38
+ return await self.process_data_columns(rows, path)
@@ -0,0 +1,36 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Utility for retrieving properties from nested dictionaries."""
5
+
6
+ from typing import Any
7
+
8
+
9
+ def get_property(data: dict[str, Any], path: str) -> Any:
10
+ """Retrieve a property from a dictionary using dot notation.
11
+
12
+ Parameters
13
+ ----------
14
+ data : dict[str, Any]
15
+ The dictionary to retrieve the property from.
16
+ path : str
17
+ A dot-separated string representing the path to the property (e.g., "foo.bar.baz").
18
+
19
+ Returns
20
+ -------
21
+ Any
22
+ The value at the specified path.
23
+
24
+ Raises
25
+ ------
26
+ KeyError
27
+ If the path does not exist in the dictionary.
28
+ """
29
+ keys = path.split(".")
30
+ current = data
31
+ for key in keys:
32
+ if not isinstance(current, dict) or key not in current:
33
+ msg = f"Property '{path}' not found"
34
+ raise KeyError(msg)
35
+ current = current[key]
36
+ return current
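A short usage sketch for `get_property` (illustrative, not from the package sources; the dictionary contents are made up), showing both the nested lookup and the `KeyError` raised for a missing path:

```python
from graphrag_input import get_property

record = {"meta": {"author": {"name": "Ada"}}}

print(get_property(record, "meta.author.name"))  # -> Ada

try:
    get_property(record, "meta.author.email")
except KeyError as err:
    # The path does not exist, so a KeyError is raised with the full path:
    print(err)  # "Property 'meta.author.email' not found"
```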
@@ -0,0 +1,27 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Hashing utilities."""
5
+
6
+ from collections.abc import Iterable
7
+ from hashlib import sha512
8
+ from typing import Any
9
+
10
+
11
+ def gen_sha512_hash(item: dict[str, Any], hashcode: Iterable[str]) -> str:
12
+ """Generate a SHA512 hash.
13
+
14
+ Parameters
15
+ ----------
16
+ item : dict[str, Any]
17
+ The dictionary containing values to hash.
18
+ hashcode : Iterable[str]
19
+ The keys to include in the hash.
20
+
21
+ Returns
22
+ -------
23
+ str
24
+ The SHA512 hash as a hexadecimal string.
25
+ """
26
+ hashed = "".join([str(item[column]) for column in hashcode])
27
+ return f"{sha512(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}"
@@ -0,0 +1,40 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Parameterization settings for the default configuration."""
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
+ from graphrag_input.input_type import InputType
9
+
10
+
11
+ class InputConfig(BaseModel):
12
+ """The default configuration section for Input."""
13
+
14
+ model_config = ConfigDict(extra="allow")
15
+ """Allow extra fields to support custom reader implementations."""
16
+
17
+ type: str = Field(
18
+ description="The input file type to use.",
19
+ default=InputType.Text,
20
+ )
21
+ encoding: str | None = Field(
22
+ description="The input file encoding to use.",
23
+ default=None,
24
+ )
25
+ file_pattern: str | None = Field(
26
+ description="The input file pattern to use.",
27
+ default=None,
28
+ )
29
+ id_column: str | None = Field(
30
+ description="The input ID column to use.",
31
+ default=None,
32
+ )
33
+ title_column: str | None = Field(
34
+ description="The input title column to use.",
35
+ default=None,
36
+ )
37
+ text_column: str | None = Field(
38
+ description="The input text column to use.",
39
+ default=None,
40
+ )
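Because `model_config` uses `extra="allow"`, fields not declared on `InputConfig` survive validation and are forwarded to the reader initializer by `create_input_reader` (which passes `model_dump()` as init args). A small sketch, where `delimiter` is a hypothetical extra field a custom reader might consume:

```python
from graphrag_input import InputConfig, InputType

config = InputConfig(
    type=InputType.Csv,
    text_column="content",
    delimiter=";",  # not a declared field; kept because extra="allow"
)

print(config.model_dump()["delimiter"])  # ;
```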
@@ -0,0 +1,75 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'InputReader' model."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import re
10
+ from abc import ABCMeta, abstractmethod
11
+ from typing import TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ from graphrag_storage import Storage
15
+
16
+ from graphrag_input.text_document import TextDocument
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class InputReader(metaclass=ABCMeta):
22
+ """Provide a cache interface for the pipeline."""
23
+
24
+ def __init__(
25
+ self,
26
+ storage: Storage,
27
+ file_pattern: str,
28
+ encoding: str = "utf-8",
29
+ **kwargs,
30
+ ):
31
+ self._storage = storage
32
+ self._encoding = encoding
33
+ self._file_pattern = file_pattern
34
+
35
+ async def read_files(self) -> list[TextDocument]:
36
+ """Load files from storage and apply a loader function based on file type. Process metadata on the results if needed."""
37
+ files = list(self._storage.find(re.compile(self._file_pattern)))
38
+ if len(files) == 0:
39
+ msg = f"No {self._file_pattern} matches found in storage"
40
+ logger.warning(msg)
41
+ files = []
42
+
43
+ documents: list[TextDocument] = []
44
+
45
+ for file in files:
46
+ try:
47
+ documents.extend(await self.read_file(file))
48
+ except Exception as e: # noqa: BLE001 (catching Exception is fine here)
49
+ logger.warning("Warning! Error loading file %s. Skipping...", file)
50
+ logger.warning("Error: %s", e)
51
+
52
+ logger.info(
53
+ "Found %d %s files, loading %d",
54
+ len(files),
55
+ self._file_pattern,
56
+ len(documents),
57
+ )
58
+ total_files_log = (
59
+ f"Total number of unfiltered {self._file_pattern} rows: {len(documents)}"
60
+ )
61
+ logger.info(total_files_log)
62
+
63
+ return documents
64
+
65
+ @abstractmethod
66
+ async def read_file(self, path: str) -> list[TextDocument]:
67
+ """Read a file into a list of documents.
68
+
69
+ Args:
70
+ - path - The path to read the file from.
71
+
72
+ Returns
73
+ -------
74
+ - output - List with an entry for each document in the file.
75
+ """
@@ -0,0 +1,90 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'InputReaderFactory' model."""
5
+
6
+ import logging
7
+ from collections.abc import Callable
8
+
9
+ from graphrag_common.factory import Factory
10
+ from graphrag_common.factory.factory import ServiceScope
11
+ from graphrag_storage.storage import Storage
12
+
13
+ from graphrag_input.input_config import InputConfig
14
+ from graphrag_input.input_reader import InputReader
15
+ from graphrag_input.input_type import InputType
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class InputReaderFactory(Factory[InputReader]):
21
+ """Factory for creating Input Reader instances."""
22
+
23
+
24
+ input_reader_factory = InputReaderFactory()
25
+
26
+
27
+ def register_input_reader(
28
+ input_reader_type: str,
29
+ input_reader_initializer: Callable[..., InputReader],
30
+ scope: ServiceScope = "transient",
31
+ ) -> None:
32
+ """Register a custom input reader implementation.
33
+
34
+ Args
35
+ ----
36
+ - input_reader_type: str
37
+ The input reader id to register.
38
+ - input_reader_initializer: Callable[..., InputReader]
39
+ The input reader initializer to register.
40
+ """
41
+ input_reader_factory.register(input_reader_type, input_reader_initializer, scope)
42
+
43
+
44
+ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
45
+ """Create an input reader implementation based on the given configuration.
46
+
47
+ Args
48
+ ----
49
+ - config: InputConfig
50
+ The input reader configuration to use.
51
+ - storage: Storage | None
52
+ The storage implementation to use for reading the files.
53
+
54
+ Returns
55
+ -------
56
+ InputReader
57
+ The created input reader implementation.
58
+ """
59
+ config_model = config.model_dump()
60
+ input_strategy = config.type
61
+
62
+ if input_strategy not in input_reader_factory:
63
+ match input_strategy:
64
+ case InputType.Csv:
65
+ from graphrag_input.csv import CSVFileReader
66
+
67
+ register_input_reader(InputType.Csv, CSVFileReader)
68
+ case InputType.Text:
69
+ from graphrag_input.text import TextFileReader
70
+
71
+ register_input_reader(InputType.Text, TextFileReader)
72
+ case InputType.Json:
73
+ from graphrag_input.json import JSONFileReader
74
+
75
+ register_input_reader(InputType.Json, JSONFileReader)
76
+ case InputType.JsonLines:
77
+ from graphrag_input.jsonl import JSONLinesFileReader
78
+
79
+ register_input_reader(InputType.JsonLines, JSONLinesFileReader)
80
+ case InputType.MarkItDown:
81
+ from graphrag_input.markitdown import MarkItDownFileReader
82
+
83
+ register_input_reader(InputType.MarkItDown, MarkItDownFileReader)
84
+ case _:
85
+ msg = f"InputConfig.type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
86
+ raise ValueError(msg)
87
+
88
+ config_model["storage"] = storage
89
+
90
+ return input_reader_factory.create(input_strategy, init_args=config_model)
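A sketch of registering and using a custom reader, assuming the hypothetical `MarkdownFileReader` sketched after `input_reader.py` above is in scope; the `"markdown"` type id is made up. `register_input_reader` is imported from the factory module since the package `__init__` does not re-export it, and the top-level `await` assumes a notebook-style context:

```python
from graphrag_input import InputConfig, create_input_reader
from graphrag_input.input_reader_factory import register_input_reader
from graphrag_storage import StorageConfig, create_storage

# Register the custom reader under a new type id. InputConfig.type is a plain
# string field, so ids outside the InputType enum are accepted.
register_input_reader("markdown", MarkdownFileReader)

config = InputConfig(type="markdown")
storage = create_storage(StorageConfig(base_dir="./input"))
reader = create_input_reader(config, storage)
documents = await reader.read_files()
```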
@@ -0,0 +1,25 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing input file type enum."""
5
+
6
+ from enum import StrEnum
7
+
8
+
9
+ class InputType(StrEnum):
10
+ """The input file type for the pipeline."""
11
+
12
+ Csv = "csv"
13
+ """The CSV input type."""
14
+ Text = "text"
15
+ """The text input type."""
16
+ Json = "json"
17
+ """The JSON input type."""
18
+ JsonLines = "jsonl"
19
+ """The JSON Lines input type."""
20
+ MarkItDown = "markitdown"
21
+ """The MarkItDown input type."""
22
+
23
+ def __repr__(self):
24
+ """Get a string representation."""
25
+ return f'"{self.value}"'
@@ -0,0 +1,38 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'JSONFileReader' model."""
5
+
6
+ import json
7
+ import logging
8
+
9
+ from graphrag_input.structured_file_reader import StructuredFileReader
10
+ from graphrag_input.text_document import TextDocument
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class JSONFileReader(StructuredFileReader):
16
+ """Reader implementation for json files."""
17
+
18
+ def __init__(self, file_pattern: str | None = None, **kwargs):
19
+ super().__init__(
20
+ file_pattern=file_pattern if file_pattern is not None else ".*\\.json$",
21
+ **kwargs,
22
+ )
23
+
24
+ async def read_file(self, path: str) -> list[TextDocument]:
25
+ """Read a JSON file into a list of documents.
26
+
27
+ Args:
28
+ - path - The path to read the file from.
29
+
30
+ Returns
31
+ -------
32
+ - output - list with a TextDocument for each row in the file.
33
+ """
34
+ text = await self._storage.get(path, encoding=self._encoding)
35
+ as_json = json.loads(text)
36
+ # json file could just be a single object, or an array of objects
37
+ rows = as_json if isinstance(as_json, list) else [as_json]
38
+ return await self.process_data_columns(rows, path)
@@ -0,0 +1,38 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'JSONLinesFileReader' model."""
5
+
6
+ import json
7
+ import logging
8
+
9
+ from graphrag_input.structured_file_reader import StructuredFileReader
10
+ from graphrag_input.text_document import TextDocument
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class JSONLinesFileReader(StructuredFileReader):
16
+ """Reader implementation for json lines files."""
17
+
18
+ def __init__(self, file_pattern: str | None = None, **kwargs):
19
+ super().__init__(
20
+ file_pattern=file_pattern if file_pattern is not None else ".*\\.jsonl$",
21
+ **kwargs,
22
+ )
23
+
24
+ async def read_file(self, path: str) -> list[TextDocument]:
25
+ """Read a JSON lines file into a list of documents.
26
+
27
+ This differs from standard JSON files in that each line is a separate JSON object.
28
+
29
+ Args:
30
+ - path - The path to read the file from.
31
+
32
+ Returns
33
+ -------
34
+ - output - list with a TextDocument for each row in the file.
35
+ """
36
+ text = await self._storage.get(path, encoding=self._encoding)
37
+ rows = [json.loads(line) for line in text.splitlines()]
38
+ return await self.process_data_columns(rows, path)
@@ -0,0 +1,49 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'TextFileReader' model."""
5
+
6
+ import logging
7
+ from io import BytesIO
8
+ from pathlib import Path
9
+
10
+ from markitdown import MarkItDown, StreamInfo
11
+
12
+ from graphrag_input.hashing import gen_sha512_hash
13
+ from graphrag_input.input_reader import InputReader
14
+ from graphrag_input.text_document import TextDocument
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class MarkItDownFileReader(InputReader):
20
+ """Reader implementation for any file type supported by markitdown.
21
+
22
+ https://github.com/microsoft/markitdown
23
+ """
24
+
25
+ async def read_file(self, path: str) -> list[TextDocument]:
26
+ """Read a text file into a DataFrame of documents.
27
+
28
+ Args:
29
+ - path - The path to read the file from.
30
+
31
+ Returns
32
+ -------
33
+ - output - list containing a single TextDocument converted from the file.
34
+ """
35
+ bytes = await self._storage.get(path, encoding=self._encoding, as_bytes=True)
36
+ md = MarkItDown()
37
+ result = md.convert_stream(
38
+ BytesIO(bytes), stream_info=StreamInfo(extension=Path(path).suffix)
39
+ )
40
+ text = result.markdown
41
+
42
+ document = TextDocument(
43
+ id=gen_sha512_hash({"text": text}, ["text"]),
44
+ title=result.title if result.title else str(Path(path).name),
45
+ text=text,
46
+ creation_date=await self._storage.get_creation_date(path),
47
+ raw_data=None,
48
+ )
49
+ return [document]
@@ -0,0 +1,65 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'StructuredFileReader' model."""
5
+
6
+ import logging
7
+ from typing import Any
8
+
9
+ from graphrag_input.get_property import get_property
10
+ from graphrag_input.hashing import gen_sha512_hash
11
+ from graphrag_input.input_reader import InputReader
12
+ from graphrag_input.text_document import TextDocument
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class StructuredFileReader(InputReader):
18
+ """Base reader implementation for structured files such as csv and json."""
19
+
20
+ def __init__(
21
+ self,
22
+ id_column: str | None = None,
23
+ title_column: str | None = None,
24
+ text_column: str = "text",
25
+ **kwargs,
26
+ ):
27
+ super().__init__(**kwargs)
28
+ self._id_column = id_column
29
+ self._title_column = title_column
30
+ self._text_column = text_column
31
+
32
+ async def process_data_columns(
33
+ self,
34
+ rows: list[dict[str, Any]],
35
+ path: str,
36
+ ) -> list[TextDocument]:
37
+ """Process configured data columns from a list of loaded dicts."""
38
+ documents = []
39
+ for index, row in enumerate(rows):
40
+ # text column is required - harvest from dict
41
+ text = get_property(row, self._text_column)
42
+ # id is optional - harvest from dict or hash from text
43
+ id = (
44
+ get_property(row, self._id_column)
45
+ if self._id_column
46
+ else gen_sha512_hash({"text": text}, ["text"])
47
+ )
48
+ # title is optional - harvest from dict or use filename
49
+ num = f" ({index})" if len(rows) > 1 else ""
50
+ title = (
51
+ get_property(row, self._title_column)
52
+ if self._title_column
53
+ else f"{path}{num}"
54
+ )
55
+ creation_date = await self._storage.get_creation_date(path)
56
+ documents.append(
57
+ TextDocument(
58
+ id=id,
59
+ title=title,
60
+ text=text,
61
+ creation_date=creation_date,
62
+ raw_data=row,
63
+ )
64
+ )
65
+ return documents
@@ -0,0 +1,43 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'TextFileReader' model."""
5
+
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ from graphrag_input.hashing import gen_sha512_hash
10
+ from graphrag_input.input_reader import InputReader
11
+ from graphrag_input.text_document import TextDocument
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class TextFileReader(InputReader):
17
+ """Reader implementation for text files."""
18
+
19
+ def __init__(self, file_pattern: str | None = None, **kwargs):
20
+ super().__init__(
21
+ file_pattern=file_pattern if file_pattern is not None else ".*\\.txt$",
22
+ **kwargs,
23
+ )
24
+
25
+ async def read_file(self, path: str) -> list[TextDocument]:
26
+ """Read a text file into a list of documents.
27
+
28
+ Args:
29
+ - path - The path to read the file from.
30
+
31
+ Returns
32
+ -------
33
+ - output - list containing a single TextDocument for the file.
34
+ """
35
+ text = await self._storage.get(path, encoding=self._encoding)
36
+ document = TextDocument(
37
+ id=gen_sha512_hash({"text": text}, ["text"]),
38
+ title=str(Path(path).name),
39
+ text=text,
40
+ creation_date=await self._storage.get_creation_date(path),
41
+ raw_data=None,
42
+ )
43
+ return [document]
@@ -0,0 +1,59 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """TextDocument dataclass."""
5
+
6
+ import logging
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ from graphrag_input.get_property import get_property
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ @dataclass
16
+ class TextDocument:
17
+ """The TextDocument holds relevant content for GraphRAG indexing."""
18
+
19
+ id: str
20
+ """Unique identifier for the document."""
21
+ text: str
22
+ """The main text content of the document."""
23
+ title: str
24
+ """The title of the document."""
25
+ creation_date: str
26
+ """The creation date of the document, ISO-8601 format."""
27
+ raw_data: dict[str, Any] | None = None
28
+ """Raw data from source document."""
29
+
30
+ def get(self, field: str, default_value: Any = None) -> Any:
31
+ """
32
+ Get a single field from the TextDocument.
33
+
34
+ Functions like the get method on a dictionary, returning default_value if the field is not found.
35
+
36
+ Supports nested fields using dot notation.
37
+
38
+ This takes a two-step approach for flexibility:
39
+ 1. If the field is one of the standard text document fields (id, title, text, creation_date), just grab it directly. This accommodates unstructured text, for example, which has only the standard fields.
40
+ 2. Otherwise, try to extract it from the raw_data dict. This allows users to specify any column from the original input file.
41
+
42
+ """
43
+ if field in ["id", "title", "text", "creation_date"]:
44
+ return getattr(self, field)
45
+
46
+ raw = self.raw_data or {}
47
+ try:
48
+ return get_property(raw, field)
49
+ except KeyError:
50
+ return default_value
51
+
52
+ def collect(self, fields: list[str]) -> dict[str, Any]:
53
+ """Extract data fields from a TextDocument into a dict."""
54
+ data = {}
55
+ for field in fields:
56
+ value = self.get(field)
57
+ if value is not None:
58
+ data[field] = value
59
+ return data
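Illustrative use of `TextDocument.get` and `collect` (not from the package sources; the document contents are made up). Standard fields are read directly, and everything else is resolved from `raw_data` with dot notation:

```python
from graphrag_input import TextDocument

doc = TextDocument(
    id="doc-1",
    text="Hello world",
    title="Example",
    creation_date="2024-01-01T00:00:00Z",
    raw_data={"source": {"url": "https://example.com"}},
)

print(doc.get("title"))                       # Example (standard field)
print(doc.get("source.url"))                  # https://example.com (from raw_data)
print(doc.get("missing", default_value=""))   # empty string fallback
print(doc.collect(["title", "source.url"]))   # {'title': 'Example', 'source.url': 'https://example.com'}
```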
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "graphrag-input"
3
+ version = "3.0.0"
4
+ description = "Input document loading utilities for GraphRAG"
5
+ authors = [
6
+ {name = "Alonso Guevara Fernández", email = "alonsog@microsoft.com"},
7
+ {name = "Andrés Morales Esquivel", email = "andresmor@microsoft.com"},
8
+ {name = "Chris Trevino", email = "chtrevin@microsoft.com"},
9
+ {name = "David Tittsworth", email = "datittsw@microsoft.com"},
10
+ {name = "Dayenne de Souza", email = "ddesouza@microsoft.com"},
11
+ {name = "Derek Worthen", email = "deworthe@microsoft.com"},
12
+ {name = "Gaudy Blanco Meneses", email = "gaudyb@microsoft.com"},
13
+ {name = "Ha Trinh", email = "trinhha@microsoft.com"},
14
+ {name = "Jonathan Larson", email = "jolarso@microsoft.com"},
15
+ {name = "Josh Bradley", email = "joshbradley@microsoft.com"},
16
+ {name = "Kate Lytvynets", email = "kalytv@microsoft.com"},
17
+ {name = "Kenny Zhang", email = "zhangken@microsoft.com"},
18
+ {name = "Mónica Carvajal"},
19
+ {name = "Nathan Evans", email = "naevans@microsoft.com"},
20
+ {name = "Rodrigo Racanicci", email = "rracanicci@microsoft.com"},
21
+ {name = "Sarah Smith", email = "smithsarah@microsoft.com"},
22
+ ]
23
+ license = {text = "MIT"}
24
+ readme = "README.md"
25
+ requires-python = ">=3.11,<3.14"
26
+ classifiers = [
27
+ "Programming Language :: Python :: 3",
28
+ "Programming Language :: Python :: 3.11",
29
+ "Programming Language :: Python :: 3.12",
30
+ "Programming Language :: Python :: 3.13",
31
+ ]
32
+ dependencies = [
33
+ "graphrag-common==3.0.0",
34
+ "graphrag-storage==3.0.0 ",
35
+ "pydantic~=2.10",
36
+ "markitdown~=0.1.0"
37
+ ]
38
+
39
+ [project.urls]
40
+ Source = "https://github.com/microsoft/graphrag"
41
+
42
+ [build-system]
43
+ requires = ["hatchling>=1.27.0,<2.0.0"]
44
+ build-backend = "hatchling.build"