ragbits-document-search 1.4.0.dev202601310254__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. ragbits/document_search/__init__.py +3 -0
  2. ragbits/document_search/_main.py +273 -0
  3. ragbits/document_search/cli.py +109 -0
  4. ragbits/document_search/documents/__init__.py +0 -0
  5. ragbits/document_search/documents/document.py +203 -0
  6. ragbits/document_search/documents/element.py +208 -0
  7. ragbits/document_search/ingestion/__init__.py +0 -0
  8. ragbits/document_search/ingestion/enrichers/__init__.py +5 -0
  9. ragbits/document_search/ingestion/enrichers/base.py +64 -0
  10. ragbits/document_search/ingestion/enrichers/exceptions.py +32 -0
  11. ragbits/document_search/ingestion/enrichers/image.py +107 -0
  12. ragbits/document_search/ingestion/enrichers/router.py +86 -0
  13. ragbits/document_search/ingestion/parsers/__init__.py +9 -0
  14. ragbits/document_search/ingestion/parsers/base.py +97 -0
  15. ragbits/document_search/ingestion/parsers/docling.py +178 -0
  16. ragbits/document_search/ingestion/parsers/exceptions.py +32 -0
  17. ragbits/document_search/ingestion/parsers/pptx/__init__.py +28 -0
  18. ragbits/document_search/ingestion/parsers/pptx/callbacks.py +32 -0
  19. ragbits/document_search/ingestion/parsers/pptx/exceptions.py +52 -0
  20. ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py +84 -0
  21. ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py +78 -0
  22. ragbits/document_search/ingestion/parsers/pptx/parser.py +85 -0
  23. ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py +75 -0
  24. ragbits/document_search/ingestion/parsers/router.py +90 -0
  25. ragbits/document_search/ingestion/parsers/unstructured.py +248 -0
  26. ragbits/document_search/ingestion/strategies/__init__.py +6 -0
  27. ragbits/document_search/ingestion/strategies/base.py +290 -0
  28. ragbits/document_search/ingestion/strategies/batched.py +261 -0
  29. ragbits/document_search/ingestion/strategies/ray.py +138 -0
  30. ragbits/document_search/ingestion/strategies/sequential.py +23 -0
  31. ragbits/document_search/py.typed +0 -0
  32. ragbits/document_search/retrieval/__init__.py +0 -0
  33. ragbits/document_search/retrieval/rephrasers/__init__.py +18 -0
  34. ragbits/document_search/retrieval/rephrasers/base.py +39 -0
  35. ragbits/document_search/retrieval/rephrasers/llm.py +141 -0
  36. ragbits/document_search/retrieval/rephrasers/noop.py +26 -0
  37. ragbits/document_search/retrieval/rerankers/__init__.py +4 -0
  38. ragbits/document_search/retrieval/rerankers/answerai.py +82 -0
  39. ragbits/document_search/retrieval/rerankers/base.py +56 -0
  40. ragbits/document_search/retrieval/rerankers/litellm.py +85 -0
  41. ragbits/document_search/retrieval/rerankers/llm.py +177 -0
  42. ragbits/document_search/retrieval/rerankers/noop.py +34 -0
  43. ragbits/document_search/retrieval/rerankers/rrf.py +73 -0
  44. ragbits_document_search-1.4.0.dev202601310254.dist-info/METADATA +85 -0
  45. ragbits_document_search-1.4.0.dev202601310254.dist-info/RECORD +46 -0
  46. ragbits_document_search-1.4.0.dev202601310254.dist-info/WHEEL +4 -0
@@ -0,0 +1,208 @@
1
+ import hashlib
2
+ import uuid
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, ClassVar
5
+
6
+ from pydantic import BaseModel, computed_field
7
+
8
+ from ragbits.core.utils.pydantic import SerializableBytes
9
+ from ragbits.core.vector_stores.base import VectorStoreEntry
10
+ from ragbits.document_search.documents.document import DocumentMeta
11
+
12
+
13
class ElementLocation(BaseModel):
    """
    An object representing position of chunk within document.
    """

    # Page number of the chunk; None when it is not provided.
    page_number: int | None = None
    # Coordinates of the chunk as a plain dict; None when they are not provided.
    # NOTE(review): the expected keys of this dict are not visible here — confirm against the parsers.
    coordinates: dict | None = None
20
+
21
+
22
class Element(BaseModel, ABC):
    """
    An object representing an element in a document.

    Every subclass is recorded in a shared registry under the default value of its
    ``element_type`` field, which lets `from_vector_db_entry` rebuild the matching
    element class from stored metadata.
    """

    element_type: str
    document_meta: DocumentMeta
    location: ElementLocation | None = None
    score: float | None = None

    # Maps the default ``element_type`` of each subclass to the subclass itself.
    _elements_registry: ClassVar[dict[str, type["Element"]]] = {}

    @property
    def id(self) -> str:
        """
        Retrieve the ID of the element, primarily used to represent the element's data.

        Returns:
            The ``name=value`` pairs from `get_id_components`, joined with ``&``.
        """
        return "&".join(f"{name}={value}" for name, value in self.get_id_components().items())

    def get_id_components(self) -> dict[str, str]:
        """
        Create the dictionary of components that make up the element ID.

        Returns:
            A dictionary with the document meta ID, element type, key, text representation
            and location, each rendered as a string.
        """
        return {
            "meta": self.document_meta.id,
            "type": self.element_type,
            "key": str(self.key),
            "text": str(self.text_representation),
            "location": str(self.location),
        }

    @computed_field  # type: ignore[prop-decorator]
    @property
    def key(self) -> str | None:
        """
        Get the representation of the element for embedding.

        Returns:
            The representation for embedding; by default the text representation.
        """
        return self.text_representation

    @computed_field  # type: ignore[prop-decorator]
    @property
    @abstractmethod
    def text_representation(self) -> str | None:
        """
        Get the text representation of the element.

        Returns:
            The text representation.
        """

    @property
    def image_representation(self) -> bytes | None:
        """
        Get the image representation of the element.

        Returns:
            The image representation; None unless a subclass overrides it.
        """
        return None

    @classmethod
    def __pydantic_init_subclass__(cls, **kwargs: Any) -> None:  # noqa: ANN401
        """
        Register the subclass in the element registry under its default element type.

        Raises:
            ValueError: If the default of ``element_type`` is None.
        """
        element_type_default = cls.model_fields["element_type"].default
        # NOTE(review): a subclass that declares no default at all may not yield None here
        # (pydantic may report a dedicated "undefined" marker) — confirm whether that case should raise too.
        if element_type_default is None:
            raise ValueError("Element type must be defined")
        Element._elements_registry[element_type_default] = cls

    @classmethod
    def from_vector_db_entry(cls, db_entry: VectorStoreEntry, score: float | None = None) -> "Element":
        """
        Create an element from a vector database entry.

        The entry is only read: the constructor arguments are taken from a filtered copy of
        its metadata, so the caller's ``db_entry`` keeps every key it had.

        Args:
            db_entry: The vector database entry.
            score: The score of the element retrieved from the vector database or reranker.

        Returns:
            The element, an instance of the class registered for the stored element type.

        Raises:
            KeyError: If the metadata has no ``element_type`` or the type is not registered.
        """
        element_cls = Element._elements_registry[db_entry.metadata["element_type"]]
        # "embedding_type" is not an element field, so it is left out of the constructor arguments.
        element_kwargs = {name: value for name, value in db_entry.metadata.items() if name != "embedding_type"}

        element = element_cls(**element_kwargs)
        element.score = score
        return element

    def to_vector_db_entry(self) -> VectorStoreEntry:
        """
        Create a vector database entry from the element.

        Returns:
            The vector database entry, whose ID is a UUIDv5 derived from the element ID.
        """
        vector_store_entry_id = uuid.uuid5(uuid.NAMESPACE_OID, self.id)
        metadata = self.model_dump(exclude={"id", "key"})
        # The source ID is written into the dumped metadata explicitly
        # (presumably it is not part of the dump itself — TODO confirm).
        metadata["document_meta"]["source"]["id"] = self.document_meta.source.id

        return VectorStoreEntry(
            id=vector_store_entry_id, text=self.key, image_bytes=self.image_representation, metadata=metadata
        )
138
+
139
+
140
class TextElement(Element):
    """
    An object representing a text element in a document.
    """

    # Default element type; the base class registers this subclass under it.
    element_type: str = "text"
    # The text carried by the element.
    content: str

    @computed_field  # type: ignore[prop-decorator]
    @property
    def text_representation(self) -> str:
        """
        Get the text representation of the element.

        Returns:
            The text representation, which is the element content unchanged.
        """
        return self.content
158
+
159
+
160
class ImageElement(Element):
    """
    An object representing an image element in a document.

    Besides the raw image bytes, the element can carry a description and text extracted
    from the image; both feed the text representation.
    """

    element_type: str = "image"
    image_bytes: SerializableBytes
    description: str | None = None
    ocr_extracted_text: str | None = None

    @computed_field  # type: ignore[prop-decorator]
    @property
    def text_representation(self) -> str | None:
        """
        Get the text representation of the element.

        Returns:
            The description and extracted text combined, or None when both are empty.
        """
        parts: list[str] = []
        if self.description:
            parts.append(f"Description: {self.description}\n")
        if self.ocr_extracted_text:
            parts.append(f"Extracted text: {self.ocr_extracted_text}")

        # No description and no extracted text means there is nothing to represent.
        if not parts:
            return None
        return "".join(parts)

    @property
    def image_representation(self) -> bytes:
        """
        Get the image representation of the element.

        Returns:
            The raw bytes of the image.
        """
        return self.image_bytes

    def get_id_components(self) -> dict[str, str]:
        """
        Create the dictionary of components that make up the element ID.

        Returns:
            The base components extended with the SHA-256 hex digest of the image bytes.
        """
        components = super().get_id_components()
        digest = hashlib.sha256(self.image_bytes).hexdigest()
        components["image_hash"] = digest
        return components
File without changes
@@ -0,0 +1,5 @@
1
+ from ragbits.document_search.ingestion.enrichers.base import ElementEnricher
2
+ from ragbits.document_search.ingestion.enrichers.image import ImageElementEnricher
3
+ from ragbits.document_search.ingestion.enrichers.router import ElementEnricherRouter
4
+
5
+ __all__ = ["ElementEnricher", "ElementEnricherRouter", "ImageElementEnricher"]
@@ -0,0 +1,64 @@
1
+ from abc import ABC, abstractmethod
2
+ from types import ModuleType, UnionType
3
+ from typing import ClassVar, Generic, TypeVar, get_args, get_origin
4
+
5
+ from ragbits.core.utils.config_handling import WithConstructionConfig
6
+ from ragbits.document_search.documents.element import Element
7
+ from ragbits.document_search.ingestion import enrichers
8
+ from ragbits.document_search.ingestion.enrichers.exceptions import EnricherElementNotSupportedError
9
+
10
# Type variable for the element class an enricher works on; bound to Element.
ElementT = TypeVar("ElementT", bound=Element)
11
+
12
+
13
class ElementEnricher(Generic[ElementT], WithConstructionConfig, ABC):
    """
    Base class for element enrichers, responsible for providing additional information about elements.

    Enrichers operate on raw elements and are used to fill in missing fields that could not be filled in during parsing.
    They usually deal with summarizing text or describing images.
    """

    default_module: ClassVar[ModuleType | None] = enrichers
    configuration_key: ClassVar[str] = "enricher"

    @abstractmethod
    async def enrich(self, elements: list[ElementT]) -> list[ElementT]:
        """
        Enrich elements.

        Args:
            elements: The elements to be enriched.

        Returns:
            The list of enriched elements.

        Raises:
            EnricherError: If the enrichment of the elements failed.
        """

    @classmethod
    def validate_element_type(cls, element_type: type[Element]) -> None:
        """
        Check if the enricher supports the element type.

        The supported type is read from the type argument the enricher passes to its first
        generic base. It may be a single element class or a union of element classes, written
        either as ``A | B`` or as ``Union[A, B]``. Matching is exact, subclasses are not accepted.

        Args:
            element_type: The element type to validate against the enricher.

        Raises:
            EnricherElementNotSupportedError: If the element type is not supported.
        """
        # Imported locally because the module's top-level imports do not include typing.Union.
        from typing import Union

        expected_element_type = cls.__orig_bases__[0].__args__[0]  # type: ignore

        # `A | B` reports types.UnionType as its origin while `Union[A, B]` reports typing.Union,
        # so both spellings have to be recognised as a union of supported types.
        is_union = get_origin(expected_element_type) in (UnionType, Union)
        if is_union and element_type in get_args(expected_element_type):
            return

        # Check if element_type matches expected_element_type exactly
        if element_type == expected_element_type:
            return

        raise EnricherElementNotSupportedError(enricher_name=cls.__name__, element_type=element_type)
@@ -0,0 +1,32 @@
1
+ from ragbits.document_search.documents.element import Element
2
+
3
+
4
class EnricherError(Exception):
    """
    Common base for the exceptions raised by element enrichers and the enricher router.
    """

    def __init__(self, message: str) -> None:
        # Keep the message on the instance as well as passing it to Exception.
        self.message = message
        super().__init__(message)
12
+
13
+
14
class EnricherNotFoundError(EnricherError):
    """
    Signals that no enricher is available for the requested element type.
    """

    def __init__(self, element_type: type[Element]) -> None:
        description = f"No enricher found for the element type {element_type}"
        super().__init__(description)
        # The element type that could not be routed, kept for callers to inspect.
        self.element_type = element_type
22
+
23
+
24
class EnricherElementNotSupportedError(EnricherError):
    """
    Signals that an enricher was asked to handle an element type it does not support.
    """

    def __init__(self, enricher_name: str, element_type: type[Element]) -> None:
        description = f"Element type {element_type} is not supported by the {enricher_name}"
        super().__init__(description)
        # Both inputs are kept on the instance for callers to inspect.
        self.element_type = element_type
        self.enricher_name = enricher_name
@@ -0,0 +1,107 @@
1
+ from pydantic import BaseModel
2
+
3
+ from ragbits.core.llms.base import LLM, LLMType
4
+ from ragbits.core.llms.factory import get_preferred_llm
5
+ from ragbits.core.prompt import Attachment, Prompt
6
+ from ragbits.core.utils.config_handling import ObjectConstructionConfig, import_by_path
7
+ from ragbits.document_search.documents.element import ImageElement
8
+ from ragbits.document_search.ingestion.enrichers.base import ElementEnricher
9
+
10
+
11
class ImageDescriberInput(BaseModel):
    """
    Input data for an image describer prompt.
    """

    # The image to describe, wrapped as a prompt attachment.
    image: Attachment
17
+
18
+
19
class ImageDescriberOutput(BaseModel):
    """
    Output data for an image describer prompt.
    """

    # The description produced for the image.
    description: str
25
+
26
+
27
class ImageDescriberPrompt(Prompt[ImageDescriberInput, ImageDescriberOutput]):
    """
    Prompt for describing image elements using LLM.
    """

    # The instruction text of the prompt; the image itself comes from ImageDescriberInput.
    user_prompt = "Describe the content of the image."
33
+
34
+
35
class ImageElementEnricher(ElementEnricher[ImageElement]):
    """
    Enricher that describes image elements using LLM.
    """

    def __init__(
        self,
        llm: LLM | None = None,
        prompt: type[Prompt[ImageDescriberInput, ImageDescriberOutput]] | None = None,
    ) -> None:
        """
        Initialize the ImageElementEnricher instance.

        Args:
            llm: The language model to use for describing images. When omitted, the preferred
                vision LLM is used.
            prompt: The prompt class to use. When omitted, `ImageDescriberPrompt` is used.
        """
        self._llm = llm or get_preferred_llm(llm_type=LLMType.VISION)
        self._prompt = prompt or ImageDescriberPrompt

    async def enrich(self, elements: list[ImageElement]) -> list[ImageElement]:
        """
        Enrich image elements with additional description of the image.

        Each element is described by one LLM call, made one after another in input order.
        The returned elements are new objects that keep the document metadata, location,
        image bytes and extracted text of the originals.

        Args:
            elements: The elements to be enriched.

        Returns:
            The list of enriched elements.

        Raises:
            EnricherElementNotSupportedError: If the element type is not supported.
            LLMError: If LLM generation fails.
        """
        enriched: list[ImageElement] = []
        for element in elements:
            self.validate_element_type(type(element))
            prompt = self._prompt(ImageDescriberInput(image=Attachment(data=element.image_bytes)))
            response = await self._llm.generate(prompt)
            enriched.append(
                ImageElement(
                    document_meta=element.document_meta,
                    # Carry the position over so the enriched element still points at its place in the document.
                    location=element.location,
                    description=response.description,
                    image_bytes=element.image_bytes,
                    ocr_extracted_text=element.ocr_extracted_text,
                )
            )
        return enriched

    @classmethod
    def from_config(cls, config: dict) -> "ImageElementEnricher":
        """
        Create an `ImageElementEnricher` instance from a configuration dictionary.

        The passed dictionary is not modified; resolved objects are placed in a copy.

        Args:
            config: The dictionary containing the configuration settings.

        Returns:
            The initialized instance of `ImageElementEnricher`.

        Raises:
            ValidationError: If the configuration doesn't follow the expected format.
            InvalidConfigError: If llm or prompt can't be found or are not the correct type.
        """
        # Shallow copy so the caller's configuration keeps its original "llm" / "prompt" values.
        resolved = dict(config)
        resolved["llm"] = (
            LLM.subclass_from_config(ObjectConstructionConfig.model_validate(config["llm"]))
            if "llm" in config
            else None
        )
        resolved["prompt"] = import_by_path(config["prompt"]) if "prompt" in config else None
        return super().from_config(resolved)
@@ -0,0 +1,86 @@
1
+ from collections.abc import Mapping
2
+ from typing import ClassVar
3
+
4
+ from typing_extensions import Self
5
+
6
+ from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig, import_by_path
7
+ from ragbits.document_search.documents import element
8
+ from ragbits.document_search.documents.element import Element
9
+ from ragbits.document_search.ingestion.enrichers.base import ElementEnricher
10
+ from ragbits.document_search.ingestion.enrichers.exceptions import EnricherNotFoundError
11
+
12
# Enrichers the router falls back to when none are supplied; it is empty as defined here.
_DEFAULT_ENRICHERS: dict[type[Element], ElementEnricher] = {}
13
+
14
+
15
class ElementEnricherRouter(WithConstructionConfig):
    """
    Routes an element type to the enricher that should process it.
    """

    configuration_key: ClassVar[str] = "enricher_router"

    _enrichers: Mapping[type[Element], ElementEnricher]

    def __init__(
        self,
        enrichers: Mapping[type[Element], ElementEnricher] | None = None,
    ) -> None:
        """
        Initialize the ElementEnricherRouter instance.

        Args:
            enrichers: The mapping of element types and their enrichers. Entries given here
                take precedence over the default enrichers for the same element type.
        """
        if enrichers:
            self._enrichers = {**_DEFAULT_ENRICHERS, **enrichers}
        else:
            self._enrichers = _DEFAULT_ENRICHERS

    def __contains__(self, element_type: type[Element]) -> bool:
        """
        Tell whether an enricher is defined for the given element type.

        Args:
            element_type: The element type.

        Returns:
            True if the enricher is defined for the element, otherwise False.
        """
        return element_type in self._enrichers

    @classmethod
    def from_config(cls, config: dict[str, ObjectConstructionConfig]) -> Self:
        """
        Build the router from a configuration mapping element type names to enricher configs.

        Args:
            config: A dictionary containing configuration details for the class.

        Returns:
            The ElementEnricherRouter.

        Raises:
            InvalidConfigError: If any of the provided enrichers cannot be initialized.
        """
        resolved: dict[type[Element], ElementEnricher] = {}
        for element_type, enricher_config in config.items():
            # Resolve the element class first, then build its enricher.
            element_cls = import_by_path(element_type, element)
            resolved[element_cls] = ElementEnricher.subclass_from_config(enricher_config)
        return super().from_config({"enrichers": resolved})

    def get(self, element_type: type[Element]) -> ElementEnricher:
        """
        Look up the enricher for an element type.

        Args:
            element_type: The element type.

        Returns:
            The enricher for processing the element.

        Raises:
            EnricherNotFoundError: If no enricher is found for the element type.
        """
        candidate = self._enrichers.get(element_type)

        # A missing entry (or anything that is not an enricher) counts as "not found".
        if not isinstance(candidate, ElementEnricher):
            raise EnricherNotFoundError(element_type)

        return candidate
@@ -0,0 +1,9 @@
1
+ from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
2
+ from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
3
+
4
+ __all__ = [
5
+ "DocumentParser",
6
+ "DocumentParserRouter",
7
+ "ImageDocumentParser",
8
+ "TextDocumentParser",
9
+ ]
@@ -0,0 +1,97 @@
1
+ from abc import ABC, abstractmethod
2
+ from types import ModuleType
3
+ from typing import ClassVar
4
+
5
+ from ragbits.core.utils.config_handling import WithConstructionConfig
6
+ from ragbits.document_search.documents.document import Document, DocumentType
7
+ from ragbits.document_search.documents.element import Element, ImageElement, TextElement
8
+ from ragbits.document_search.ingestion import parsers
9
+ from ragbits.document_search.ingestion.parsers.exceptions import ParserDocumentNotSupportedError
10
+
11
+
12
class DocumentParser(WithConstructionConfig, ABC):
    """
    Abstract base for document parsers, which turn a document into a list of elements.
    """

    default_module: ClassVar[ModuleType | None] = parsers
    configuration_key: ClassVar[str] = "parser"

    # Document types the parser accepts; empty on the base class, set by subclasses.
    supported_document_types: set[DocumentType] = set()

    @abstractmethod
    async def parse(self, document: Document) -> list[Element]:
        """
        Parse the document.

        Args:
            document: The document to parse.

        Returns:
            The list of elements extracted from the document.

        Raises:
            ParserError: If the parsing of the document failed.
        """

    @classmethod
    def validate_document_type(cls, document_type: DocumentType) -> None:
        """
        Ensure the parser supports the given document type.

        Args:
            document_type: The document type to validate against the parser.

        Raises:
            ParserDocumentNotSupportedError: If the document type is not supported.
        """
        if document_type in cls.supported_document_types:
            return
        raise ParserDocumentNotSupportedError(parser_name=cls.__name__, document_type=document_type)
50
+
51
+
52
class TextDocumentParser(DocumentParser):
    """
    Simple parser that maps a text to the text element.
    """

    supported_document_types = {DocumentType.TXT, DocumentType.MD}

    async def parse(self, document: Document) -> list[Element]:
        """
        Parse the document.

        The file is decoded as UTF-8 so that the result does not depend on the locale
        of the machine running the ingestion.

        Args:
            document: The document to parse.

        Returns:
            List with a single text element holding the whole text content.

        Raises:
            ParserDocumentNotSupportedError: If the document type is not supported by the parser.
            UnicodeDecodeError: If the file content is not valid UTF-8.
        """
        self.validate_document_type(document.metadata.document_type)
        # Without an explicit encoding, read_text() falls back to the platform default.
        content = document.local_path.read_text(encoding="utf-8")
        return [TextElement(content=content, document_meta=document.metadata)]
74
+
75
+
76
class ImageDocumentParser(DocumentParser):
    """
    Minimal parser that wraps an image file in a single image element.
    """

    supported_document_types = {DocumentType.JPG, DocumentType.PNG}

    async def parse(self, document: Document) -> list[Element]:
        """
        Parse the document.

        Args:
            document: The document to parse.

        Returns:
            List with a single image element holding the raw image content.

        Raises:
            ParserDocumentNotSupportedError: If the document type is not supported by the parser.
        """
        meta = document.metadata
        self.validate_document_type(meta.document_type)
        raw_image = document.local_path.read_bytes()
        return [ImageElement(image_bytes=raw_image, document_meta=document.metadata)]