graphrag-input 3.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
1
+ # Python Artifacts
2
+ python/*/lib/
3
+ dist/
4
+ build/
5
+ *.egg-info/
6
+
7
+ # Test Output
8
+ .coverage
9
+ coverage/
10
+ licenses.txt
11
+ examples_notebooks/*/data
12
+ tests/fixtures/cache
13
+ tests/fixtures/*/cache
14
+ tests/fixtures/*/output
15
+ output/lancedb
16
+
17
+
18
+ # Random
19
+ .DS_Store
20
+ *.log*
21
+ .venv
22
+ venv/
23
+ .conda
24
+ .tmp
25
+ packages/graphrag-llm/notebooks/metrics
26
+ packages/graphrag-llm/notebooks/cache
27
+
28
+ .env
29
+ build.zip
30
+
31
+ .turbo
32
+
33
+ __pycache__
34
+
35
+ .pipeline
36
+
37
+ # Azurite
38
+ temp_azurite/
39
+ __azurite*.json
40
+ __blobstorage*.json
41
+ __blobstorage__/
42
+
43
+ # Getting started example
44
+ ragtest/
45
+ .ragtest/
46
+ .pipelines
47
+ .pipeline
48
+
49
+
50
+ # mkdocs
51
+ site/
52
+
53
+ # Docs migration
54
+ docsite/
55
+ .yarn/
56
+ .pnp*
57
+
58
+ # PyCharm
59
+ .idea/
60
+
61
+ # Jupyter notebook
62
+ .ipynb_checkpoints/
63
+
64
+ # Root build assets
65
+ packages/*/LICENSE
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: graphrag-input
3
+ Version: 3.0.0
4
+ Summary: Input document loading utilities for GraphRAG
5
+ Project-URL: Source, https://github.com/microsoft/graphrag
6
+ Author: Mónica Carvajal
7
+ Author-email: Alonso Guevara Fernández <alonsog@microsoft.com>, Andrés Morales Esquivel <andresmor@microsoft.com>, Chris Trevino <chtrevin@microsoft.com>, David Tittsworth <datittsw@microsoft.com>, Dayenne de Souza <ddesouza@microsoft.com>, Derek Worthen <deworthe@microsoft.com>, Gaudy Blanco Meneses <gaudyb@microsoft.com>, Ha Trinh <trinhha@microsoft.com>, Jonathan Larson <jolarso@microsoft.com>, Josh Bradley <joshbradley@microsoft.com>, Kate Lytvynets <kalytv@microsoft.com>, Kenny Zhang <zhangken@microsoft.com>, Nathan Evans <naevans@microsoft.com>, Rodrigo Racanicci <rracanicci@microsoft.com>, Sarah Smith <smithsarah@microsoft.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Python: <3.14,>=3.11
15
+ Requires-Dist: graphrag-common==3.0.0
16
+ Requires-Dist: graphrag-storage==3.0.0
17
+ Requires-Dist: markitdown~=0.1.0
18
+ Requires-Dist: pydantic~=2.10
19
+ Description-Content-Type: text/markdown
20
+
21
+ # GraphRAG Inputs
22
+
23
+ This package provides input document loading utilities for GraphRAG, supporting multiple file formats including CSV, JSON, JSON Lines, and plain text.
24
+
25
+ ## Supported File Types
26
+
27
+ The following four standard file formats are supported out of the box:
28
+
29
+ - **CSV** - Tabular data with configurable column mappings
30
+ - **JSON** - JSON files with configurable property paths
31
+ - **JSON Lines** - Line-delimited JSON records
32
+ - **Text** - Plain text files
33
+
34
+ ### MarkItDown Support
35
+
36
+ Additionally, we support the `InputType.MarkItDown` format, which uses the [MarkItDown](https://github.com/microsoft/markitdown) library to import any supported file type. The MarkItDown converter can handle a wide variety of file formats including Office documents, PDFs, HTML, and more.
37
+
38
+ **Note:** Additional optional dependencies may need to be installed depending on the file type you're processing. The choice of converter is determined by MarkItDown's processing logic, which primarily uses the file extension to select the appropriate converter. Please refer to the [MarkItDown repository](https://github.com/microsoft/markitdown) for installation instructions and detailed information about supported formats.
39
+
40
+ ## Examples
41
+
42
+ Basic usage with the factory:
43
+ ```python
44
+ from graphrag_input import create_input_reader, InputConfig, InputType
45
+ from graphrag_storage import StorageConfig, create_storage
46
+
47
+ config = InputConfig(
48
+ type=InputType.Csv,
49
+ text_column="content",
50
+ title_column="title",
51
+ )
52
+ storage = create_storage(StorageConfig(base_dir="./input"))
53
+ reader = create_input_reader(config, storage)
54
+ documents = await reader.read_files()
55
+ ```
56
+
57
+ Import a PDF with MarkItDown:
58
+
59
+ ```bash
60
+ pip install 'markitdown[pdf]' # required dependency for pdf processing
61
+ ```
62
+
63
+ ```python
64
+ from graphrag_input import create_input_reader, InputConfig, InputType
65
+ from graphrag_storage import StorageConfig, create_storage
66
+
67
+ config = InputConfig(
68
+ type=InputType.MarkItDown,
69
+ file_pattern=".*\\.pdf$"
70
+ )
71
+ storage = create_storage(StorageConfig(base_dir="./input"))
72
+ reader = create_input_reader(config, storage)
73
+ documents = await reader.read_files()
74
+ ```
75
+
76
+ Equivalent YAML configuration for the example above:
77
+ ```yaml
78
+ input:
79
+ type: markitdown
80
+ file_pattern: ".*\\.pdf$$"
81
+ input_storage:
82
+ type: file
83
+ base_dir: "input"
84
+ ```
85
+
86
+ Note that when specifying column names for data extraction, we can handle nested objects (e.g., in JSON) with dot notation:
87
+ ```python
88
+ from graphrag_input import get_property
89
+
90
+ data = {"user": {"profile": {"name": "Alice"}}}
91
+ name = get_property(data, "user.profile.name") # Returns "Alice"
92
+ ```
@@ -0,0 +1,72 @@
1
+ # GraphRAG Inputs
2
+
3
+ This package provides input document loading utilities for GraphRAG, supporting multiple file formats including CSV, JSON, JSON Lines, and plain text.
4
+
5
+ ## Supported File Types
6
+
7
+ The following four standard file formats are supported out of the box:
8
+
9
+ - **CSV** - Tabular data with configurable column mappings
10
+ - **JSON** - JSON files with configurable property paths
11
+ - **JSON Lines** - Line-delimited JSON records
12
+ - **Text** - Plain text files
13
+
14
+ ### MarkItDown Support
15
+
16
+ Additionally, we support the `InputType.MarkItDown` format, which uses the [MarkItDown](https://github.com/microsoft/markitdown) library to import any supported file type. The MarkItDown converter can handle a wide variety of file formats including Office documents, PDFs, HTML, and more.
17
+
18
+ **Note:** Additional optional dependencies may need to be installed depending on the file type you're processing. The choice of converter is determined by MarkItDown's processing logic, which primarily uses the file extension to select the appropriate converter. Please refer to the [MarkItDown repository](https://github.com/microsoft/markitdown) for installation instructions and detailed information about supported formats.
19
+
20
+ ## Examples
21
+
22
+ Basic usage with the factory:
23
+ ```python
24
+ from graphrag_input import create_input_reader, InputConfig, InputType
25
+ from graphrag_storage import StorageConfig, create_storage
26
+
27
+ config = InputConfig(
28
+ type=InputType.Csv,
29
+ text_column="content",
30
+ title_column="title",
31
+ )
32
+ storage = create_storage(StorageConfig(base_dir="./input"))
33
+ reader = create_input_reader(config, storage)
34
+ documents = await reader.read_files()
35
+ ```
36
+
37
+ Import a PDF with MarkItDown:
38
+
39
+ ```bash
40
+ pip install 'markitdown[pdf]' # required dependency for pdf processing
41
+ ```
42
+
43
+ ```python
44
+ from graphrag_input import create_input_reader, InputConfig, InputType
45
+ from graphrag_storage import StorageConfig, create_storage
46
+
47
+ config = InputConfig(
48
+ type=InputType.MarkItDown,
49
+ file_pattern=".*\\.pdf$"
50
+ )
51
+ storage = create_storage(StorageConfig(base_dir="./input"))
52
+ reader = create_input_reader(config, storage)
53
+ documents = await reader.read_files()
54
+ ```
55
+
56
+ Equivalent YAML configuration for the example above:
57
+ ```yaml
58
+ input:
59
+ type: markitdown
60
+ file_pattern: ".*\\.pdf$$"
61
+ input_storage:
62
+ type: file
63
+ base_dir: "input"
64
+ ```
65
+
66
+ Note that when specifying column names for data extraction, we can handle nested objects (e.g., in JSON) with dot notation:
67
+ ```python
68
+ from graphrag_input import get_property
69
+
70
+ data = {"user": {"profile": {"name": "Alice"}}}
71
+ name = get_property(data, "user.profile.name") # Returns "Alice"
72
+ ```
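For illustration, a minimal sketch (not part of the packaged files) of using a dotted path directly as `text_column` when reading JSON input; nested column names are resolved through `get_property`. The field names `article.body` and `article.headline` are hypothetical, and the top-level `await` assumes the same notebook-style context as the examples above.

```python
from graphrag_input import InputConfig, InputType, create_input_reader
from graphrag_storage import StorageConfig, create_storage

# Hypothetical JSON records shaped like {"article": {"headline": "...", "body": "..."}}.
# Dotted column paths are resolved via get_property, so nested values can be
# used as the text and title of each document.
config = InputConfig(
    type=InputType.Json,
    text_column="article.body",
    title_column="article.headline",
)
storage = create_storage(StorageConfig(base_dir="./input"))
reader = create_input_reader(config, storage)
documents = await reader.read_files()
```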
@@ -0,0 +1,20 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """GraphRAG input document loading package."""
5
+
6
+ from graphrag_input.get_property import get_property
7
+ from graphrag_input.input_config import InputConfig
8
+ from graphrag_input.input_reader import InputReader
9
+ from graphrag_input.input_reader_factory import create_input_reader
10
+ from graphrag_input.input_type import InputType
11
+ from graphrag_input.text_document import TextDocument
12
+
13
+ __all__ = [
14
+ "InputConfig",
15
+ "InputReader",
16
+ "InputType",
17
+ "TextDocument",
18
+ "create_input_reader",
19
+ "get_property",
20
+ ]
@@ -0,0 +1,38 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'CSVFileReader' model."""
5
+
6
+ import csv
7
+ import logging
8
+
9
+ from graphrag_input.structured_file_reader import StructuredFileReader
10
+ from graphrag_input.text_document import TextDocument
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class CSVFileReader(StructuredFileReader):
16
+ """Reader implementation for csv files."""
17
+
18
+ def __init__(self, file_pattern: str | None = None, **kwargs):
19
+ super().__init__(
20
+ file_pattern=file_pattern if file_pattern is not None else ".*\\.csv$",
21
+ **kwargs,
22
+ )
23
+
24
+ async def read_file(self, path: str) -> list[TextDocument]:
25
+ """Read a csv file into a list of documents.
26
+
27
+ Args:
28
+ - path - The path to read the file from.
29
+
30
+ Returns
31
+ -------
32
+ - output - list with a TextDocument for each row in the file.
33
+ """
34
+ file = await self._storage.get(path, encoding=self._encoding)
35
+
36
+ reader = csv.DictReader(file.splitlines())
37
+ rows = list(reader)
38
+ return await self.process_data_columns(rows, path)
@@ -0,0 +1,36 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Utility for retrieving properties from nested dictionaries."""
5
+
6
+ from typing import Any
7
+
8
+
9
+ def get_property(data: dict[str, Any], path: str) -> Any:
10
+ """Retrieve a property from a dictionary using dot notation.
11
+
12
+ Parameters
13
+ ----------
14
+ data : dict[str, Any]
15
+ The dictionary to retrieve the property from.
16
+ path : str
17
+ A dot-separated string representing the path to the property (e.g., "foo.bar.baz").
18
+
19
+ Returns
20
+ -------
21
+ Any
22
+ The value at the specified path.
23
+
24
+ Raises
25
+ ------
26
+ KeyError
27
+ If the path does not exist in the dictionary.
28
+ """
29
+ keys = path.split(".")
30
+ current = data
31
+ for key in keys:
32
+ if not isinstance(current, dict) or key not in current:
33
+ msg = f"Property '{path}' not found"
34
+ raise KeyError(msg)
35
+ current = current[key]
36
+ return current
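A short usage sketch for `get_property` (illustrative, not from the package sources; the dictionary contents are made up), showing both the nested lookup and the `KeyError` raised for a missing path:

```python
from graphrag_input import get_property

record = {"meta": {"author": {"name": "Ada"}}}

print(get_property(record, "meta.author.name"))  # -> Ada

try:
    get_property(record, "meta.author.email")
except KeyError as err:
    # The path does not exist, so a KeyError is raised with the full path:
    print(err)  # "Property 'meta.author.email' not found"
```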
@@ -0,0 +1,27 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Hashing utilities."""
5
+
6
+ from collections.abc import Iterable
7
+ from hashlib import sha512
8
+ from typing import Any
9
+
10
+
11
+ def gen_sha512_hash(item: dict[str, Any], hashcode: Iterable[str]) -> str:
12
+ """Generate a SHA512 hash.
13
+
14
+ Parameters
15
+ ----------
16
+ item : dict[str, Any]
17
+ The dictionary containing values to hash.
18
+ hashcode : Iterable[str]
19
+ The keys to include in the hash.
20
+
21
+ Returns
22
+ -------
23
+ str
24
+ The SHA512 hash as a hexadecimal string.
25
+ """
26
+ hashed = "".join([str(item[column]) for column in hashcode])
27
+ return f"{sha512(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}"
@@ -0,0 +1,40 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Parameterization settings for the default configuration."""
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
+ from graphrag_input.input_type import InputType
9
+
10
+
11
+ class InputConfig(BaseModel):
12
+ """The default configuration section for Input."""
13
+
14
+ model_config = ConfigDict(extra="allow")
15
+ """Allow extra fields to support custom reader implementations."""
16
+
17
+ type: str = Field(
18
+ description="The input file type to use.",
19
+ default=InputType.Text,
20
+ )
21
+ encoding: str | None = Field(
22
+ description="The input file encoding to use.",
23
+ default=None,
24
+ )
25
+ file_pattern: str | None = Field(
26
+ description="The input file pattern to use.",
27
+ default=None,
28
+ )
29
+ id_column: str | None = Field(
30
+ description="The input ID column to use.",
31
+ default=None,
32
+ )
33
+ title_column: str | None = Field(
34
+ description="The input title column to use.",
35
+ default=None,
36
+ )
37
+ text_column: str | None = Field(
38
+ description="The input text column to use.",
39
+ default=None,
40
+ )
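Because `model_config` uses `extra="allow"`, fields not declared on `InputConfig` survive validation and are forwarded to the reader initializer by `create_input_reader` (which passes `model_dump()` as init args). A small sketch, where `delimiter` is a hypothetical extra field a custom reader might consume:

```python
from graphrag_input import InputConfig, InputType

config = InputConfig(
    type=InputType.Csv,
    text_column="content",
    delimiter=";",  # not a declared field; kept because extra="allow"
)

print(config.model_dump()["delimiter"])  # ;
```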
@@ -0,0 +1,75 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'InputReader' model."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import re
10
+ from abc import ABCMeta, abstractmethod
11
+ from typing import TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ from graphrag_storage import Storage
15
+
16
+ from graphrag_input.text_document import TextDocument
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class InputReader(metaclass=ABCMeta):
22
+ """Provide a cache interface for the pipeline."""
23
+
24
+ def __init__(
25
+ self,
26
+ storage: Storage,
27
+ file_pattern: str,
28
+ encoding: str = "utf-8",
29
+ **kwargs,
30
+ ):
31
+ self._storage = storage
32
+ self._encoding = encoding
33
+ self._file_pattern = file_pattern
34
+
35
+ async def read_files(self) -> list[TextDocument]:
36
+ """Load files from storage and apply a loader function based on file type. Process metadata on the results if needed."""
37
+ files = list(self._storage.find(re.compile(self._file_pattern)))
38
+ if len(files) == 0:
39
+ msg = f"No {self._file_pattern} matches found in storage"
40
+ logger.warning(msg)
41
+ files = []
42
+
43
+ documents: list[TextDocument] = []
44
+
45
+ for file in files:
46
+ try:
47
+ documents.extend(await self.read_file(file))
48
+ except Exception as e: # noqa: BLE001 (catching Exception is fine here)
49
+ logger.warning("Warning! Error loading file %s. Skipping...", file)
50
+ logger.warning("Error: %s", e)
51
+
52
+ logger.info(
53
+ "Found %d %s files, loading %d",
54
+ len(files),
55
+ self._file_pattern,
56
+ len(documents),
57
+ )
58
+ total_files_log = (
59
+ f"Total number of unfiltered {self._file_pattern} rows: {len(documents)}"
60
+ )
61
+ logger.info(total_files_log)
62
+
63
+ return documents
64
+
65
+ @abstractmethod
66
+ async def read_file(self, path: str) -> list[TextDocument]:
67
+ """Read a file into a list of documents.
68
+
69
+ Args:
70
+ - path - The path to read the file from.
71
+
72
+ Returns
73
+ -------
74
+ - output - List with an entry for each document in the file.
75
+ """
@@ -0,0 +1,90 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'InputReaderFactory' model."""
5
+
6
+ import logging
7
+ from collections.abc import Callable
8
+
9
+ from graphrag_common.factory import Factory
10
+ from graphrag_common.factory.factory import ServiceScope
11
+ from graphrag_storage.storage import Storage
12
+
13
+ from graphrag_input.input_config import InputConfig
14
+ from graphrag_input.input_reader import InputReader
15
+ from graphrag_input.input_type import InputType
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class InputReaderFactory(Factory[InputReader]):
21
+ """Factory for creating Input Reader instances."""
22
+
23
+
24
+ input_reader_factory = InputReaderFactory()
25
+
26
+
27
+ def register_input_reader(
28
+ input_reader_type: str,
29
+ input_reader_initializer: Callable[..., InputReader],
30
+ scope: ServiceScope = "transient",
31
+ ) -> None:
32
+ """Register a custom input reader implementation.
33
+
34
+ Args
35
+ ----
36
+ - input_reader_type: str
37
+ The input reader id to register.
38
+ - input_reader_initializer: Callable[..., InputReader]
39
+ The input reader initializer to register.
40
+ """
41
+ input_reader_factory.register(input_reader_type, input_reader_initializer, scope)
42
+
43
+
44
+ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
45
+ """Create an input reader implementation based on the given configuration.
46
+
47
+ Args
48
+ ----
49
+ - config: InputConfig
50
+ The input reader configuration to use.
51
+ - storage: Storage | None
52
+ The storage implementation to use for reading the files.
53
+
54
+ Returns
55
+ -------
56
+ InputReader
57
+ The created input reader implementation.
58
+ """
59
+ config_model = config.model_dump()
60
+ input_strategy = config.type
61
+
62
+ if input_strategy not in input_reader_factory:
63
+ match input_strategy:
64
+ case InputType.Csv:
65
+ from graphrag_input.csv import CSVFileReader
66
+
67
+ register_input_reader(InputType.Csv, CSVFileReader)
68
+ case InputType.Text:
69
+ from graphrag_input.text import TextFileReader
70
+
71
+ register_input_reader(InputType.Text, TextFileReader)
72
+ case InputType.Json:
73
+ from graphrag_input.json import JSONFileReader
74
+
75
+ register_input_reader(InputType.Json, JSONFileReader)
76
+ case InputType.JsonLines:
77
+ from graphrag_input.jsonl import JSONLinesFileReader
78
+
79
+ register_input_reader(InputType.JsonLines, JSONLinesFileReader)
80
+ case InputType.MarkItDown:
81
+ from graphrag_input.markitdown import MarkItDownFileReader
82
+
83
+ register_input_reader(InputType.MarkItDown, MarkItDownFileReader)
84
+ case _:
85
+ msg = f"InputConfig.type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
86
+ raise ValueError(msg)
87
+
88
+ config_model["storage"] = storage
89
+
90
+ return input_reader_factory.create(input_strategy, init_args=config_model)
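A sketch of registering and using a custom reader, assuming the hypothetical `MarkdownFileReader` sketched after `input_reader.py` above is in scope; the `"markdown"` type id is made up. `register_input_reader` is imported from the factory module since the package `__init__` does not re-export it, and the top-level `await` assumes a notebook-style context:

```python
from graphrag_input import InputConfig, create_input_reader
from graphrag_input.input_reader_factory import register_input_reader
from graphrag_storage import StorageConfig, create_storage

# Register the custom reader under a new type id. InputConfig.type is a plain
# string field, so ids outside the InputType enum are accepted.
register_input_reader("markdown", MarkdownFileReader)

config = InputConfig(type="markdown")
storage = create_storage(StorageConfig(base_dir="./input"))
reader = create_input_reader(config, storage)
documents = await reader.read_files()
```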
@@ -0,0 +1,25 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing input file type enum."""
5
+
6
+ from enum import StrEnum
7
+
8
+
9
+ class InputType(StrEnum):
10
+ """The input file type for the pipeline."""
11
+
12
+ Csv = "csv"
13
+ """The CSV input type."""
14
+ Text = "text"
15
+ """The text input type."""
16
+ Json = "json"
17
+ """The JSON input type."""
18
+ JsonLines = "jsonl"
19
+ """The JSON Lines input type."""
20
+ MarkItDown = "markitdown"
21
+ """The MarkItDown input type."""
22
+
23
+ def __repr__(self):
24
+ """Get a string representation."""
25
+ return f'"{self.value}"'
@@ -0,0 +1,38 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'JSONFileReader' model."""
5
+
6
+ import json
7
+ import logging
8
+
9
+ from graphrag_input.structured_file_reader import StructuredFileReader
10
+ from graphrag_input.text_document import TextDocument
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class JSONFileReader(StructuredFileReader):
16
+ """Reader implementation for json files."""
17
+
18
+ def __init__(self, file_pattern: str | None = None, **kwargs):
19
+ super().__init__(
20
+ file_pattern=file_pattern if file_pattern is not None else ".*\\.json$",
21
+ **kwargs,
22
+ )
23
+
24
+ async def read_file(self, path: str) -> list[TextDocument]:
25
+ """Read a JSON file into a list of documents.
26
+
27
+ Args:
28
+ - path - The path to read the file from.
29
+
30
+ Returns
31
+ -------
32
+ - output - list with a TextDocument for each row in the file.
33
+ """
34
+ text = await self._storage.get(path, encoding=self._encoding)
35
+ as_json = json.loads(text)
36
+ # json file could just be a single object, or an array of objects
37
+ rows = as_json if isinstance(as_json, list) else [as_json]
38
+ return await self.process_data_columns(rows, path)
@@ -0,0 +1,38 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'JSONLinesFileReader' model."""
5
+
6
+ import json
7
+ import logging
8
+
9
+ from graphrag_input.structured_file_reader import StructuredFileReader
10
+ from graphrag_input.text_document import TextDocument
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class JSONLinesFileReader(StructuredFileReader):
16
+ """Reader implementation for json lines files."""
17
+
18
+ def __init__(self, file_pattern: str | None = None, **kwargs):
19
+ super().__init__(
20
+ file_pattern=file_pattern if file_pattern is not None else ".*\\.jsonl$",
21
+ **kwargs,
22
+ )
23
+
24
+ async def read_file(self, path: str) -> list[TextDocument]:
25
+ """Read a JSON lines file into a list of documents.
26
+
27
+ This differs from standard JSON files in that each line is a separate JSON object.
28
+
29
+ Args:
30
+ - path - The path to read the file from.
31
+
32
+ Returns
33
+ -------
34
+ - output - list with a TextDocument for each row in the file.
35
+ """
36
+ text = await self._storage.get(path, encoding=self._encoding)
37
+ rows = [json.loads(line) for line in text.splitlines()]
38
+ return await self.process_data_columns(rows, path)
@@ -0,0 +1,49 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'TextFileReader' model."""
5
+
6
+ import logging
7
+ from io import BytesIO
8
+ from pathlib import Path
9
+
10
+ from markitdown import MarkItDown, StreamInfo
11
+
12
+ from graphrag_input.hashing import gen_sha512_hash
13
+ from graphrag_input.input_reader import InputReader
14
+ from graphrag_input.text_document import TextDocument
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class MarkItDownFileReader(InputReader):
20
+ """Reader implementation for any file type supported by markitdown.
21
+
22
+ https://github.com/microsoft/markitdown
23
+ """
24
+
25
+ async def read_file(self, path: str) -> list[TextDocument]:
26
+ """Read a text file into a DataFrame of documents.
27
+
28
+ Args:
29
+ - path - The path to read the file from.
30
+
31
+ Returns
32
+ -------
33
+ - output - list containing a single TextDocument converted from the file.
34
+ """
35
+ bytes = await self._storage.get(path, encoding=self._encoding, as_bytes=True)
36
+ md = MarkItDown()
37
+ result = md.convert_stream(
38
+ BytesIO(bytes), stream_info=StreamInfo(extension=Path(path).suffix)
39
+ )
40
+ text = result.markdown
41
+
42
+ document = TextDocument(
43
+ id=gen_sha512_hash({"text": text}, ["text"]),
44
+ title=result.title if result.title else str(Path(path).name),
45
+ text=text,
46
+ creation_date=await self._storage.get_creation_date(path),
47
+ raw_data=None,
48
+ )
49
+ return [document]
@@ -0,0 +1,65 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'StructuredFileReader' model."""
5
+
6
+ import logging
7
+ from typing import Any
8
+
9
+ from graphrag_input.get_property import get_property
10
+ from graphrag_input.hashing import gen_sha512_hash
11
+ from graphrag_input.input_reader import InputReader
12
+ from graphrag_input.text_document import TextDocument
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class StructuredFileReader(InputReader):
18
+ """Base reader implementation for structured files such as csv and json."""
19
+
20
+ def __init__(
21
+ self,
22
+ id_column: str | None = None,
23
+ title_column: str | None = None,
24
+ text_column: str = "text",
25
+ **kwargs,
26
+ ):
27
+ super().__init__(**kwargs)
28
+ self._id_column = id_column
29
+ self._title_column = title_column
30
+ self._text_column = text_column
31
+
32
+ async def process_data_columns(
33
+ self,
34
+ rows: list[dict[str, Any]],
35
+ path: str,
36
+ ) -> list[TextDocument]:
37
+ """Process configured data columns from a list of loaded dicts."""
38
+ documents = []
39
+ for index, row in enumerate(rows):
40
+ # text column is required - harvest from dict
41
+ text = get_property(row, self._text_column)
42
+ # id is optional - harvest from dict or hash from text
43
+ id = (
44
+ get_property(row, self._id_column)
45
+ if self._id_column
46
+ else gen_sha512_hash({"text": text}, ["text"])
47
+ )
48
+ # title is optional - harvest from dict or use filename
49
+ num = f" ({index})" if len(rows) > 1 else ""
50
+ title = (
51
+ get_property(row, self._title_column)
52
+ if self._title_column
53
+ else f"{path}{num}"
54
+ )
55
+ creation_date = await self._storage.get_creation_date(path)
56
+ documents.append(
57
+ TextDocument(
58
+ id=id,
59
+ title=title,
60
+ text=text,
61
+ creation_date=creation_date,
62
+ raw_data=row,
63
+ )
64
+ )
65
+ return documents
@@ -0,0 +1,43 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A module containing 'TextFileReader' model."""
5
+
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ from graphrag_input.hashing import gen_sha512_hash
10
+ from graphrag_input.input_reader import InputReader
11
+ from graphrag_input.text_document import TextDocument
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class TextFileReader(InputReader):
17
+ """Reader implementation for text files."""
18
+
19
+ def __init__(self, file_pattern: str | None = None, **kwargs):
20
+ super().__init__(
21
+ file_pattern=file_pattern if file_pattern is not None else ".*\\.txt$",
22
+ **kwargs,
23
+ )
24
+
25
+ async def read_file(self, path: str) -> list[TextDocument]:
26
+ """Read a text file into a list of documents.
27
+
28
+ Args:
29
+ - path - The path to read the file from.
30
+
31
+ Returns
32
+ -------
33
+ - output - list containing a single TextDocument for the file.
34
+ """
35
+ text = await self._storage.get(path, encoding=self._encoding)
36
+ document = TextDocument(
37
+ id=gen_sha512_hash({"text": text}, ["text"]),
38
+ title=str(Path(path).name),
39
+ text=text,
40
+ creation_date=await self._storage.get_creation_date(path),
41
+ raw_data=None,
42
+ )
43
+ return [document]
@@ -0,0 +1,59 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """TextDocument dataclass."""
5
+
6
+ import logging
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ from graphrag_input.get_property import get_property
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ @dataclass
16
+ class TextDocument:
17
+ """The TextDocument holds relevant content for GraphRAG indexing."""
18
+
19
+ id: str
20
+ """Unique identifier for the document."""
21
+ text: str
22
+ """The main text content of the document."""
23
+ title: str
24
+ """The title of the document."""
25
+ creation_date: str
26
+ """The creation date of the document, ISO-8601 format."""
27
+ raw_data: dict[str, Any] | None = None
28
+ """Raw data from source document."""
29
+
30
+ def get(self, field: str, default_value: Any = None) -> Any:
31
+ """
32
+ Get a single field from the TextDocument.
33
+
34
+ Functions like the get method on a dictionary, returning default_value if the field is not found.
35
+
36
+ Supports nested fields using dot notation.
37
+
38
+ This takes a two-step approach for flexibility:
39
+ 1. If the field is one of the standard text document fields (id, title, text, creation_date), just grab it directly. This accommodates unstructured text, for example, which has only the standard fields.
40
+ 2. Otherwise, try to extract it from the raw_data dict. This allows users to specify any column from the original input file.
41
+
42
+ """
43
+ if field in ["id", "title", "text", "creation_date"]:
44
+ return getattr(self, field)
45
+
46
+ raw = self.raw_data or {}
47
+ try:
48
+ return get_property(raw, field)
49
+ except KeyError:
50
+ return default_value
51
+
52
+ def collect(self, fields: list[str]) -> dict[str, Any]:
53
+ """Extract data fields from a TextDocument into a dict."""
54
+ data = {}
55
+ for field in fields:
56
+ value = self.get(field)
57
+ if value is not None:
58
+ data[field] = value
59
+ return data
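Illustrative use of `TextDocument.get` and `collect` (not from the package sources; the document contents are made up). Standard fields are read directly, and everything else is resolved from `raw_data` with dot notation:

```python
from graphrag_input import TextDocument

doc = TextDocument(
    id="doc-1",
    text="Hello world",
    title="Example",
    creation_date="2024-01-01T00:00:00Z",
    raw_data={"source": {"url": "https://example.com"}},
)

print(doc.get("title"))                       # Example (standard field)
print(doc.get("source.url"))                  # https://example.com (from raw_data)
print(doc.get("missing", default_value=""))   # empty string fallback
print(doc.collect(["title", "source.url"]))   # {'title': 'Example', 'source.url': 'https://example.com'}
```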
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "graphrag-input"
3
+ version = "3.0.0"
4
+ description = "Input document loading utilities for GraphRAG"
5
+ authors = [
6
+ {name = "Alonso Guevara Fernández", email = "alonsog@microsoft.com"},
7
+ {name = "Andrés Morales Esquivel", email = "andresmor@microsoft.com"},
8
+ {name = "Chris Trevino", email = "chtrevin@microsoft.com"},
9
+ {name = "David Tittsworth", email = "datittsw@microsoft.com"},
10
+ {name = "Dayenne de Souza", email = "ddesouza@microsoft.com"},
11
+ {name = "Derek Worthen", email = "deworthe@microsoft.com"},
12
+ {name = "Gaudy Blanco Meneses", email = "gaudyb@microsoft.com"},
13
+ {name = "Ha Trinh", email = "trinhha@microsoft.com"},
14
+ {name = "Jonathan Larson", email = "jolarso@microsoft.com"},
15
+ {name = "Josh Bradley", email = "joshbradley@microsoft.com"},
16
+ {name = "Kate Lytvynets", email = "kalytv@microsoft.com"},
17
+ {name = "Kenny Zhang", email = "zhangken@microsoft.com"},
18
+ {name = "Mónica Carvajal"},
19
+ {name = "Nathan Evans", email = "naevans@microsoft.com"},
20
+ {name = "Rodrigo Racanicci", email = "rracanicci@microsoft.com"},
21
+ {name = "Sarah Smith", email = "smithsarah@microsoft.com"},
22
+ ]
23
+ license = {text = "MIT"}
24
+ readme = "README.md"
25
+ requires-python = ">=3.11,<3.14"
26
+ classifiers = [
27
+ "Programming Language :: Python :: 3",
28
+ "Programming Language :: Python :: 3.11",
29
+ "Programming Language :: Python :: 3.12",
30
+ "Programming Language :: Python :: 3.13",
31
+ ]
32
+ dependencies = [
33
+ "graphrag-common==3.0.0",
34
+ "graphrag-storage==3.0.0 ",
35
+ "pydantic~=2.10",
36
+ "markitdown~=0.1.0"
37
+ ]
38
+
39
+ [project.urls]
40
+ Source = "https://github.com/microsoft/graphrag"
41
+
42
+ [build-system]
43
+ requires = ["hatchling>=1.27.0,<2.0.0"]
44
+ build-backend = "hatchling.build"