graphrag-chunking 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphrag_chunking-3.0.0/.gitignore +65 -0
- graphrag_chunking-3.0.0/LICENSE +21 -0
- graphrag_chunking-3.0.0/PKG-INFO +59 -0
- graphrag_chunking-3.0.0/README.md +41 -0
- graphrag_chunking-3.0.0/graphrag_chunking/__init__.py +4 -0
- graphrag_chunking-3.0.0/graphrag_chunking/bootstrap_nltk.py +31 -0
- graphrag_chunking-3.0.0/graphrag_chunking/chunk_strategy_type.py +13 -0
- graphrag_chunking-3.0.0/graphrag_chunking/chunker.py +24 -0
- graphrag_chunking-3.0.0/graphrag_chunking/chunker_factory.py +77 -0
- graphrag_chunking-3.0.0/graphrag_chunking/chunking_config.py +36 -0
- graphrag_chunking-3.0.0/graphrag_chunking/create_chunk_results.py +32 -0
- graphrag_chunking-3.0.0/graphrag_chunking/sentence_chunker.py +48 -0
- graphrag_chunking-3.0.0/graphrag_chunking/text_chunk.py +29 -0
- graphrag_chunking-3.0.0/graphrag_chunking/token_chunker.py +69 -0
- graphrag_chunking-3.0.0/graphrag_chunking/transformers.py +25 -0
- graphrag_chunking-3.0.0/pyproject.toml +43 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Python Artifacts
|
|
2
|
+
python/*/lib/
|
|
3
|
+
dist/
|
|
4
|
+
build/
|
|
5
|
+
*.egg-info/
|
|
6
|
+
|
|
7
|
+
# Test Output
|
|
8
|
+
.coverage
|
|
9
|
+
coverage/
|
|
10
|
+
licenses.txt
|
|
11
|
+
examples_notebooks/*/data
|
|
12
|
+
tests/fixtures/cache
|
|
13
|
+
tests/fixtures/*/cache
|
|
14
|
+
tests/fixtures/*/output
|
|
15
|
+
output/lancedb
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Random
|
|
19
|
+
.DS_Store
|
|
20
|
+
*.log*
|
|
21
|
+
.venv
|
|
22
|
+
venv/
|
|
23
|
+
.conda
|
|
24
|
+
.tmp
|
|
25
|
+
packages/graphrag-llm/notebooks/metrics
|
|
26
|
+
packages/graphrag-llm/notebooks/cache
|
|
27
|
+
|
|
28
|
+
.env
|
|
29
|
+
build.zip
|
|
30
|
+
|
|
31
|
+
.turbo
|
|
32
|
+
|
|
33
|
+
__pycache__
|
|
34
|
+
|
|
35
|
+
.pipeline
|
|
36
|
+
|
|
37
|
+
# Azurite
|
|
38
|
+
temp_azurite/
|
|
39
|
+
__azurite*.json
|
|
40
|
+
__blobstorage*.json
|
|
41
|
+
__blobstorage__/
|
|
42
|
+
|
|
43
|
+
# Getting started example
|
|
44
|
+
ragtest/
|
|
45
|
+
.ragtest/
|
|
46
|
+
.pipelines
|
|
47
|
+
.pipeline
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# mkdocs
|
|
51
|
+
site/
|
|
52
|
+
|
|
53
|
+
# Docs migration
|
|
54
|
+
docsite/
|
|
55
|
+
.yarn/
|
|
56
|
+
.pnp*
|
|
57
|
+
|
|
58
|
+
# PyCharm
|
|
59
|
+
.idea/
|
|
60
|
+
|
|
61
|
+
# Jupyter notebook
|
|
62
|
+
.ipynb_checkpoints/
|
|
63
|
+
|
|
64
|
+
# Root build assets
|
|
65
|
+
packages/*/LICENSE
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Microsoft Corporation.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graphrag-chunking
|
|
3
|
+
Version: 3.0.0
|
|
4
|
+
Summary: Chunking utilities for GraphRAG
|
|
5
|
+
Project-URL: Source, https://github.com/microsoft/graphrag
|
|
6
|
+
Author: Mónica Carvajal
|
|
7
|
+
Author-email: Alonso Guevara Fernández <alonsog@microsoft.com>, Andrés Morales Esquivel <andresmor@microsoft.com>, Chris Trevino <chtrevin@microsoft.com>, David Tittsworth <datittsw@microsoft.com>, Dayenne de Souza <ddesouza@microsoft.com>, Derek Worthen <deworthe@microsoft.com>, Gaudy Blanco Meneses <gaudyb@microsoft.com>, Ha Trinh <trinhha@microsoft.com>, Jonathan Larson <jolarso@microsoft.com>, Josh Bradley <joshbradley@microsoft.com>, Kate Lytvynets <kalytv@microsoft.com>, Kenny Zhang <zhangken@microsoft.com>, Nathan Evans <naevans@microsoft.com>, Rodrigo Racanicci <rracanicci@microsoft.com>, Sarah Smith <smithsarah@microsoft.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Requires-Python: <3.14,>=3.11
|
|
15
|
+
Requires-Dist: graphrag-common==3.0.0
|
|
16
|
+
Requires-Dist: pydantic~=2.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# GraphRAG Chunking
|
|
20
|
+
|
|
21
|
+
This package contains a collection of text chunkers, a core config model, and a factory for acquiring instances.
|
|
22
|
+
|
|
23
|
+
## Examples
|
|
24
|
+
|
|
25
|
+
### Basic sentence chunking with nltk
|
|
26
|
+
|
|
27
|
+
The SentenceChunker class splits text into individual sentences by identifying sentence boundaries. It takes input text and returns a list where each element is a separate sentence, making it easy to process text at the sentence level.
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
chunker = SentenceChunker()
|
|
31
|
+
chunks = chunker.chunk("This is a test. Another sentence.")
|
|
32
|
+
print(chunks) # ["This is a test.", "Another sentence."]
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Token chunking
|
|
36
|
+
|
|
37
|
+
The TokenChunker splits text into fixed-size chunks based on token count rather than sentence boundaries. It uses a tokenizer to encode text into tokens, then creates chunks of a specified size with configurable overlap between chunks.
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
tokenizer = tiktoken.get_encoding("o200k_base")
|
|
41
|
+
chunker = TokenChunker(size=3, overlap=0, encode=tokenizer.encode, decode=tokenizer.decode)
|
|
42
|
+
chunks = chunker.chunk("This is a random test fragment of some text")
|
|
43
|
+
print(chunks) # ["This is a", " random test fragment", " of some text"]
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Using the factory via helper util
|
|
47
|
+
|
|
48
|
+
The create_chunker factory function provides a configuration-driven approach to instantiate chunkers by accepting a ChunkingConfig object that specifies the chunking strategy and parameters. This allows for more flexible and maintainable code by separating chunker configuration from direct instantiation.
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
tokenizer = tiktoken.get_encoding("o200k_base")
|
|
52
|
+
config = ChunkingConfig(
|
|
53
|
+
type="tokens",
|
|
54
|
+
size=3,
|
|
55
|
+
overlap=0
|
|
56
|
+
)
|
|
57
|
+
chunker = create_chunker(config, tokenizer.encode, tokenizer.decode)
|
|
58
|
+
...
|
|
59
|
+
```
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# GraphRAG Chunking
|
|
2
|
+
|
|
3
|
+
This package contains a collection of text chunkers, a core config model, and a factory for acquiring instances.
|
|
4
|
+
|
|
5
|
+
## Examples
|
|
6
|
+
|
|
7
|
+
### Basic sentence chunking with nltk
|
|
8
|
+
|
|
9
|
+
The SentenceChunker class splits text into individual sentences by identifying sentence boundaries. It takes input text and returns a list where each element is a separate sentence, making it easy to process text at the sentence level.
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
chunker = SentenceChunker()
|
|
13
|
+
chunks = chunker.chunk("This is a test. Another sentence.")
|
|
14
|
+
print(chunks) # ["This is a test.", "Another sentence."]
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
### Token chunking
|
|
18
|
+
|
|
19
|
+
The TokenChunker splits text into fixed-size chunks based on token count rather than sentence boundaries. It uses a tokenizer to encode text into tokens, then creates chunks of a specified size with configurable overlap between chunks.
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
tokenizer = tiktoken.get_encoding("o200k_base")
|
|
23
|
+
chunker = TokenChunker(size=3, overlap=0, encode=tokenizer.encode, decode=tokenizer.decode)
|
|
24
|
+
chunks = chunker.chunk("This is a random test fragment of some text")
|
|
25
|
+
print(chunks) # ["This is a", " random test fragment", " of some text"]
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Using the factory via helper util
|
|
29
|
+
|
|
30
|
+
The create_chunker factory function provides a configuration-driven approach to instantiate chunkers by accepting a ChunkingConfig object that specifies the chunking strategy and parameters. This allows for more flexible and maintainable code by separating chunker configuration from direct instantiation.
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
tokenizer = tiktoken.get_encoding("o200k_base")
|
|
34
|
+
config = ChunkingConfig(
|
|
35
|
+
type="tokens",
|
|
36
|
+
size=3,
|
|
37
|
+
overlap=0
|
|
38
|
+
)
|
|
39
|
+
chunker = create_chunker(config, tokenizer.encode, tokenizer.decode)
|
|
40
|
+
...
|
|
41
|
+
```
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""Bootstrap definition."""
|
|
5
|
+
|
|
6
|
+
import warnings
|
|
7
|
+
|
|
8
|
+
# Ignore warnings from numba
|
|
9
|
+
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
|
|
10
|
+
warnings.filterwarnings("ignore", message=".*Use no seed for parallelism.*")
|
|
11
|
+
|
|
12
|
+
initialized_nltk = False
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def bootstrap():
    """Download the NLTK resources needed for chunking (once per process).

    Subsequent calls are no-ops thanks to the module-level
    ``initialized_nltk`` flag.
    """
    global initialized_nltk
    if initialized_nltk:
        return

    import nltk
    from nltk.corpus import wordnet as wn

    # Resources required by nltk's tokenizers, taggers, and chunkers.
    for resource in (
        "punkt",
        "punkt_tab",
        "averaged_perceptron_tagger",
        "averaged_perceptron_tagger_eng",
        "maxent_ne_chunker",
        "maxent_ne_chunker_tab",
        "words",
        "wordnet",
    ):
        nltk.download(resource)
    wn.ensure_loaded()
    initialized_nltk = True
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""Chunk strategy type enumeration."""
|
|
5
|
+
|
|
6
|
+
from enum import StrEnum
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ChunkerType(StrEnum):
|
|
10
|
+
"""ChunkerType class definition."""
|
|
11
|
+
|
|
12
|
+
Tokens = "tokens"
|
|
13
|
+
Sentence = "sentence"
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""A module containing the 'Chunker' class."""
|
|
5
|
+
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from graphrag_chunking.text_chunk import TextChunk
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Chunker(ABC):
    """Interface implemented by all text chunkers.

    A chunker splits a document string into a list of TextChunk results,
    optionally applying a per-chunk text transform.
    """

    @abstractmethod
    def __init__(self, **kwargs: Any) -> None:
        """Initialize the chunker; keyword arguments are implementation-specific."""

    @abstractmethod
    def chunk(
        self, text: str, transform: Callable[[str], str] | None = None
    ) -> list[TextChunk]:
        """Split *text* into chunks, applying *transform* to each chunk's text when given."""
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""A module containing 'ChunkerFactory', 'register_chunker', and 'create_chunker'."""
|
|
5
|
+
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
|
|
8
|
+
from graphrag_common.factory.factory import Factory, ServiceScope
|
|
9
|
+
|
|
10
|
+
from graphrag_chunking.chunk_strategy_type import ChunkerType
|
|
11
|
+
from graphrag_chunking.chunker import Chunker
|
|
12
|
+
from graphrag_chunking.chunking_config import ChunkingConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ChunkerFactory(Factory[Chunker]):
    """A Factory specialization that produces Chunker instances."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
chunker_factory = ChunkerFactory()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def register_chunker(
    chunker_type: str,
    chunker_initializer: Callable[..., Chunker],
    scope: ServiceScope = "transient",
) -> None:
    """Register a custom chunker implementation with the shared factory.

    Args
    ----
    - chunker_type: str
        The chunker id to register.
    - chunker_initializer: Callable[..., Chunker]
        The chunker initializer to register.
    - scope: ServiceScope
        The service scope to register the chunker under (default "transient").
    """
    chunker_factory.register(chunker_type, chunker_initializer, scope)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def create_chunker(
|
|
40
|
+
config: ChunkingConfig,
|
|
41
|
+
encode: Callable[[str], list[int]] | None = None,
|
|
42
|
+
decode: Callable[[list[int]], str] | None = None,
|
|
43
|
+
) -> Chunker:
|
|
44
|
+
"""Create a chunker implementation based on the given configuration.
|
|
45
|
+
|
|
46
|
+
Args
|
|
47
|
+
----
|
|
48
|
+
- config: ChunkingConfig
|
|
49
|
+
The chunker configuration to use.
|
|
50
|
+
|
|
51
|
+
Returns
|
|
52
|
+
-------
|
|
53
|
+
Chunker
|
|
54
|
+
The created chunker implementation.
|
|
55
|
+
"""
|
|
56
|
+
config_model = config.model_dump()
|
|
57
|
+
if encode is not None:
|
|
58
|
+
config_model["encode"] = encode
|
|
59
|
+
if decode is not None:
|
|
60
|
+
config_model["decode"] = decode
|
|
61
|
+
chunker_strategy = config.type
|
|
62
|
+
|
|
63
|
+
if chunker_strategy not in chunker_factory:
|
|
64
|
+
match chunker_strategy:
|
|
65
|
+
case ChunkerType.Tokens:
|
|
66
|
+
from graphrag_chunking.token_chunker import TokenChunker
|
|
67
|
+
|
|
68
|
+
register_chunker(ChunkerType.Tokens, TokenChunker)
|
|
69
|
+
case ChunkerType.Sentence:
|
|
70
|
+
from graphrag_chunking.sentence_chunker import SentenceChunker
|
|
71
|
+
|
|
72
|
+
register_chunker(ChunkerType.Sentence, SentenceChunker)
|
|
73
|
+
case _:
|
|
74
|
+
msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
|
|
75
|
+
raise ValueError(msg)
|
|
76
|
+
|
|
77
|
+
return chunker_factory.create(chunker_strategy, init_args=config_model)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""Parameterization settings for the default configuration."""
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
7
|
+
|
|
8
|
+
from graphrag_chunking.chunk_strategy_type import ChunkerType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ChunkingConfig(BaseModel):
    """Configuration section for chunking."""

    model_config = ConfigDict(extra="allow")
    """Allow extra fields to support custom chunker implementations."""

    # Matches a ChunkerType value for built-ins, or a custom id registered
    # with the ChunkerFactory.
    type: str = Field(
        description="The chunking type to use.",
        default=ChunkerType.Tokens,
    )
    encoding_model: str | None = Field(
        description="The encoding model to use.",
        default=None,
    )
    # For the token strategy, size/overlap are measured in tokens.
    size: int = Field(
        description="The chunk size to use.",
        default=1200,
    )
    overlap: int = Field(
        description="The chunk overlap to use.",
        default=100,
    )
    prepend_metadata: list[str] | None = Field(
        description="Metadata fields from the source document to prepend on each chunk.",
        default=None,
    )
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""A module containing 'create_chunk_results' function."""
|
|
5
|
+
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
|
|
8
|
+
from graphrag_chunking.text_chunk import TextChunk
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def create_chunk_results(
    chunks: list[str],
    transform: Callable[[str], str] | None = None,
    encode: Callable[[str], list[int]] | None = None,
) -> list[TextChunk]:
    """Create chunk results from a list of text chunks.

    Index assignments are 0-based and assume the chunks are contiguous,
    unmodified slices of the source text. When *encode* is provided, each
    result's token_count is computed from its (possibly transformed) text.
    """
    results: list[TextChunk] = []
    cursor = 0
    for index, raw in enumerate(chunks):
        final_text = transform(raw) if transform else raw
        chunk = TextChunk(
            original=raw,
            text=final_text,
            index=index,
            start_char=cursor,
            end_char=cursor + len(raw) - 1,  # inclusive, 0-based
        )
        if encode:
            chunk.token_count = len(encode(chunk.text))
        results.append(chunk)
        cursor += len(raw)  # next chunk starts right after this one
    return results
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""A module containing 'SentenceChunker' class."""
|
|
5
|
+
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import nltk
|
|
10
|
+
|
|
11
|
+
from graphrag_chunking.bootstrap_nltk import bootstrap
|
|
12
|
+
from graphrag_chunking.chunker import Chunker
|
|
13
|
+
from graphrag_chunking.create_chunk_results import create_chunk_results
|
|
14
|
+
from graphrag_chunking.text_chunk import TextChunk
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SentenceChunker(Chunker):
    """A chunker that splits text into sentence-based chunks using nltk."""

    def __init__(
        self, encode: Callable[[str], list[int]] | None = None, **kwargs: Any
    ) -> None:
        """Create a sentence chunker instance.

        Args
        ----
        - encode: Callable[[str], list[int]] | None
            Optional tokenizer encode function used to compute token counts.
        """
        self._encode = encode
        bootstrap()  # ensure nltk resources are available (no-op after first call)

    def chunk(
        self, text: str, transform: Callable[[str], str] | None = None
    ) -> list[TextChunk]:
        """Chunk the text into sentence-based chunks."""
        sentences = nltk.sent_tokenize(text.strip())
        results = create_chunk_results(
            sentences, transform=transform, encode=self._encode
        )
        # nltk's sentence tokenizer may drop whitespace between sentences, so
        # the naive contiguous offsets from create_chunk_results can lag
        # behind the true positions in the source text. Re-anchor each chunk
        # by locating its raw sentence in the source.
        for index, result in enumerate(results):
            # Fix: search for the *raw* sentence (result.original), not
            # result.text — when a transform is supplied the transformed text
            # may not occur in the source at all, so the original code's find
            # returned -1 and offsets were silently left wrong.
            raw = result.original
            anchored = text.find(raw, result.start_char)
            delta = anchored - result.start_char
            if delta > 0:
                result.start_char += delta
                result.end_char += delta
                # bump the next to keep the start check from falling too far behind
                if index < len(results) - 1:
                    results[index + 1].start_char += delta
                    results[index + 1].end_char += delta
        return results
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""The TextChunk dataclass."""
|
|
5
|
+
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class TextChunk:
    """Result of chunking a document."""

    original: str
    """Raw original text chunk before any transformation."""

    text: str
    """The final text content of this chunk (after any transform)."""

    index: int
    """Zero-based index of this chunk within the source document."""

    start_char: int
    """Character index (0-based) where the raw chunk text begins in the source document."""

    end_char: int
    """Character index (0-based, inclusive) where the raw chunk text ends in the source document."""

    token_count: int | None = None
    """Number of tokens in the final chunk text, if computed."""
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""A module containing 'TokenChunker' class."""
|
|
5
|
+
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from graphrag_chunking.chunker import Chunker
|
|
10
|
+
from graphrag_chunking.create_chunk_results import create_chunk_results
|
|
11
|
+
from graphrag_chunking.text_chunk import TextChunk
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TokenChunker(Chunker):
    """A chunker that splits text into fixed-size token windows.

    Each chunk holds at most ``size`` tokens, and consecutive chunks share
    ``overlap`` tokens.
    """

    def __init__(
        self,
        size: int,
        overlap: int,
        encode: Callable[[str], list[int]],
        decode: Callable[[list[int]], str],
        **kwargs: Any,
    ) -> None:
        """Create a token chunker with the given window size, overlap, and tokenizer callables."""
        self._size = size
        self._overlap = overlap
        self._encode = encode
        self._decode = decode

    def chunk(
        self, text: str, transform: Callable[[str], str] | None = None
    ) -> list[TextChunk]:
        """Split *text* into token windows and wrap them as TextChunk results."""
        token_windows = split_text_on_tokens(
            text,
            chunk_size=self._size,
            chunk_overlap=self._overlap,
            encode=self._encode,
            decode=self._decode,
        )
        return create_chunk_results(
            token_windows, transform=transform, encode=self._encode
        )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def split_text_on_tokens(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    encode: Callable[[str], list[int]],
    decode: Callable[[list[int]], str],
) -> list[str]:
    """Split a single text and return chunks using the tokenizer.

    Encodes *text*, slides a window of at most *chunk_size* tokens advancing
    by ``chunk_size - chunk_overlap`` each step, and decodes each window back
    to a string. Returns an empty list for empty input.

    Raises
    ------
    ValueError
        If the window cannot advance (``chunk_size <= 0`` or
        ``chunk_overlap >= chunk_size``) — previously this looped forever.
    """
    if chunk_size <= 0:
        msg = f"chunk_size must be positive, got {chunk_size}."
        raise ValueError(msg)
    if chunk_overlap >= chunk_size:
        msg = f"chunk_overlap ({chunk_overlap}) must be smaller than chunk_size ({chunk_size})."
        raise ValueError(msg)

    input_tokens = encode(text)
    total = len(input_tokens)
    result: list[str] = []

    start_idx = 0
    while start_idx < total:
        cur_idx = min(start_idx + chunk_size, total)
        result.append(decode(list(input_tokens[start_idx:cur_idx])))
        if cur_idx == total:
            break
        start_idx += chunk_size - chunk_overlap  # guaranteed positive by the guards above

    return result
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""A collection of useful built-in transformers you can use for chunking."""
|
|
5
|
+
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def add_metadata(
    metadata: dict[str, Any],
    delimiter: str = ": ",
    line_delimiter: str = "\n",
    append: bool = False,
) -> Callable[[str], str]:
    """Build a transformer that attaches metadata to text, prepending by default.

    The metadata dict is rendered as one ``key<delimiter>value`` row per
    entry, joined (and terminated) by *line_delimiter*. With ``append=True``
    the rendered block follows the text instead of preceding it.
    """

    def transformer(text: str) -> str:
        rows = [f"{key}{delimiter}{value}" for key, value in metadata.items()]
        block = line_delimiter.join(rows) + line_delimiter
        if append:
            return text + block
        return block + text

    return transformer
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "graphrag-chunking"
|
|
3
|
+
version = "3.0.0"
|
|
4
|
+
description = "Chunking utilities for GraphRAG"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Alonso Guevara Fernández", email = "alonsog@microsoft.com"},
|
|
7
|
+
{name = "Andrés Morales Esquivel", email = "andresmor@microsoft.com"},
|
|
8
|
+
{name = "Chris Trevino", email = "chtrevin@microsoft.com"},
|
|
9
|
+
{name = "David Tittsworth", email = "datittsw@microsoft.com"},
|
|
10
|
+
{name = "Dayenne de Souza", email = "ddesouza@microsoft.com"},
|
|
11
|
+
{name = "Derek Worthen", email = "deworthe@microsoft.com"},
|
|
12
|
+
{name = "Gaudy Blanco Meneses", email = "gaudyb@microsoft.com"},
|
|
13
|
+
{name = "Ha Trinh", email = "trinhha@microsoft.com"},
|
|
14
|
+
{name = "Jonathan Larson", email = "jolarso@microsoft.com"},
|
|
15
|
+
{name = "Josh Bradley", email = "joshbradley@microsoft.com"},
|
|
16
|
+
{name = "Kate Lytvynets", email = "kalytv@microsoft.com"},
|
|
17
|
+
{name = "Kenny Zhang", email = "zhangken@microsoft.com"},
|
|
18
|
+
{name = "Mónica Carvajal"},
|
|
19
|
+
{name = "Nathan Evans", email = "naevans@microsoft.com"},
|
|
20
|
+
{name = "Rodrigo Racanicci", email = "rracanicci@microsoft.com"},
|
|
21
|
+
{name = "Sarah Smith", email = "smithsarah@microsoft.com"},
|
|
22
|
+
]
|
|
23
|
+
license = {text = "MIT"}
|
|
24
|
+
readme = "README.md"
|
|
25
|
+
requires-python = ">=3.11,<3.14"
|
|
26
|
+
classifiers = [
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Programming Language :: Python :: 3.13",
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
"graphrag-common==3.0.0",
|
|
34
|
+
"pydantic~=2.10",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Source = "https://github.com/microsoft/graphrag"
|
|
39
|
+
|
|
40
|
+
[build-system]
|
|
41
|
+
requires = ["hatchling>=1.27.0,<2.0.0"]
|
|
42
|
+
build-backend = "hatchling.build"
|
|
43
|
+
|