ws-bom-robot-app 0.0.103__tar.gz → 0.0.105__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. {ws_bom_robot_app-0.0.103/ws_bom_robot_app.egg-info → ws_bom_robot_app-0.0.105}/PKG-INFO +3 -3
  2. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/README.md +2 -2
  3. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/setup.py +1 -1
  4. ws_bom_robot_app-0.0.105/ws_bom_robot_app/llm/utils/chunker.py +82 -0
  5. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/db/chroma.py +1 -0
  6. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/shopify.py +7 -3
  7. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/loader/docling.py +3 -2
  8. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105/ws_bom_robot_app.egg-info}/PKG-INFO +3 -3
  9. ws_bom_robot_app-0.0.103/ws_bom_robot_app/llm/utils/chunker.py +0 -20
  10. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/MANIFEST.in +0 -0
  11. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/pyproject.toml +0 -0
  12. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/requirements.txt +0 -0
  13. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/setup.cfg +0 -0
  14. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/__init__.py +0 -0
  15. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/auth.py +0 -0
  16. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/config.py +0 -0
  17. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/cron_manager.py +0 -0
  18. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/__init__.py +0 -0
  19. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/agent_context.py +0 -0
  20. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/agent_description.py +0 -0
  21. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/agent_handler.py +0 -0
  22. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/agent_lcel.py +0 -0
  23. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/api.py +0 -0
  24. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/defaut_prompt.py +0 -0
  25. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/evaluator.py +0 -0
  26. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/feedbacks/__init__.py +0 -0
  27. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/feedbacks/feedback_manager.py +0 -0
  28. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/main.py +0 -0
  29. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/models/__init__.py +0 -0
  30. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/models/api.py +0 -0
  31. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/models/base.py +0 -0
  32. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/models/feedback.py +0 -0
  33. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/models/kb.py +0 -0
  34. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/nebuly_handler.py +0 -0
  35. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/providers/__init__.py +0 -0
  36. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/providers/llm_manager.py +0 -0
  37. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/tools/__init__.py +0 -0
  38. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/tools/models/__init__.py +0 -0
  39. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/tools/models/main.py +0 -0
  40. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/tools/tool_builder.py +0 -0
  41. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/tools/tool_manager.py +0 -0
  42. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/tools/utils.py +0 -0
  43. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/utils/__init__.py +0 -0
  44. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/utils/agent.py +0 -0
  45. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/utils/cleanup.py +0 -0
  46. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/utils/cms.py +0 -0
  47. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/utils/download.py +0 -0
  48. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/utils/print.py +0 -0
  49. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/utils/secrets.py +0 -0
  50. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/utils/webhooks.py +0 -0
  51. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/__init__.py +0 -0
  52. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/db/__init__.py +0 -0
  53. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/db/base.py +0 -0
  54. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/db/faiss.py +0 -0
  55. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/db/manager.py +0 -0
  56. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/db/qdrant.py +0 -0
  57. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/generator.py +0 -0
  58. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/__init__.py +0 -0
  59. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/api.py +0 -0
  60. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/azure.py +0 -0
  61. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/base.py +0 -0
  62. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/confluence.py +0 -0
  63. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/dropbox.py +0 -0
  64. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/gcs.py +0 -0
  65. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/github.py +0 -0
  66. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/googledrive.py +0 -0
  67. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/jira.py +0 -0
  68. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/manager.py +0 -0
  69. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/s3.py +0 -0
  70. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/sftp.py +0 -0
  71. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +0 -0
  72. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/sitemap.py +0 -0
  73. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/slack.py +0 -0
  74. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/integration/thron.py +0 -0
  75. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/loader/__init__.py +0 -0
  76. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/loader/base.py +0 -0
  77. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/llm/vector_store/loader/json_loader.py +0 -0
  78. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/main.py +0 -0
  79. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/subprocess_runner.py +0 -0
  80. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/task_manager.py +0 -0
  81. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app/util.py +0 -0
  82. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app.egg-info/SOURCES.txt +0 -0
  83. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app.egg-info/dependency_links.txt +0 -0
  84. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app.egg-info/requires.txt +0 -0
  85. {ws_bom_robot_app-0.0.103 → ws_bom_robot_app-0.0.105}/ws_bom_robot_app.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.103
3
+ Version: 0.0.105
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -317,7 +317,7 @@ code quality tools
317
317
 
318
318
  ```pwsh
319
319
  # .\src\robot
320
- !py -m uv pip install -U scanreq prospector[with_everything]
320
+ uv pip install -U scanreq prospector[with_everything]
321
321
  ## unused requirements
322
322
  scanreq -r requirements.txt -p ./ws_bom_robot_app
323
323
  ## style/linting
@@ -333,7 +333,7 @@ prospector ./ws_bom_robot_app -t pyroma
333
333
  #### 🧪 run tests
334
334
 
335
335
  ```pwsh
336
- !py -m uv pip install -U pytest pytest-asyncio pytest-mock pytest-cov pyclean
336
+ uv pip install -U pytest pytest-asyncio pytest-mock pytest-cov pyclean
337
337
  # clean cache if needed
338
338
  # pyclean --verbose .
339
339
  pytest --cov=ws_bom_robot_app --log-cli-level=info
@@ -250,7 +250,7 @@ code quality tools
250
250
 
251
251
  ```pwsh
252
252
  # .\src\robot
253
- !py -m uv pip install -U scanreq prospector[with_everything]
253
+ uv pip install -U scanreq prospector[with_everything]
254
254
  ## unused requirements
255
255
  scanreq -r requirements.txt -p ./ws_bom_robot_app
256
256
  ## style/linting
@@ -266,7 +266,7 @@ prospector ./ws_bom_robot_app -t pyroma
266
266
  #### 🧪 run tests
267
267
 
268
268
  ```pwsh
269
- !py -m uv pip install -U pytest pytest-asyncio pytest-mock pytest-cov pyclean
269
+ uv pip install -U pytest pytest-asyncio pytest-mock pytest-cov pyclean
270
270
  # clean cache if needed
271
271
  # pyclean --verbose .
272
272
  pytest --cov=ws_bom_robot_app --log-cli-level=info
@@ -4,7 +4,7 @@ _requirements = [line.split('#')[0].strip() for line in open("requirements.txt")
4
4
 
5
5
  setup(
6
6
  name="ws_bom_robot_app",
7
- version="0.0.103",
7
+ version="0.0.105",
8
8
  description="A FastAPI application serving ws bom/robot/llm platform ai.",
9
9
  long_description=open("README.md", encoding='utf-8').read(),
10
10
  long_description_content_type="text/markdown",
@@ -0,0 +1,82 @@
1
+ from langchain_core.documents import Document
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+
4
+ class DocumentChunker:
5
+ @staticmethod
6
+ def chunk(documents: list[Document]) -> list[Document]:
7
+ return DocumentChunker.chunk_recursive(documents)
8
+
9
+ @staticmethod
10
+ def chunk_recursive(documents: list[Document], chunk_size: int=3_000) -> list[Document]:
11
+ """
12
+ Recursively split documents into smaller chunks while preserving metadata.
13
+
14
+ This function takes a list of documents and splits them into smaller chunks using
15
+ RecursiveCharacterTextSplitter. Documents smaller than the chunk size are kept intact,
16
+ while larger documents are split into multiple chunks with overlapping content.
17
+
18
+ Args:
19
+ documents (list[Document]): A list of Document objects to be chunked.
20
+ chunk_size (int, optional): The maximum size of each chunk in characters.
21
+ Defaults to 3,000.
22
+
23
+ Returns:
24
+ list[Document]: A list of Document objects where each document's content is
25
+ at most chunk_size characters. Each chunk preserves the metadata from
26
+ its original document.
27
+
28
+ Notes:
29
+ - Chunk overlap is automatically set to 10% of the chunk_size to maintain
30
+ context between chunks.
31
+ - Documents smaller than or equal to chunk_size are returned unchanged.
32
+ - Metadata from the original document is copied to all resulting chunks.
33
+ """
34
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_size//10))
35
+ chunked_documents = []
36
+ for doc in documents:
37
+ if len(doc.page_content) <= chunk_size:
38
+ chunked_documents.append(doc)
39
+ continue
40
+ chunks = text_splitter.split_text(doc.page_content)
41
+ for chunk in chunks:
42
+ chunked_documents.append(
43
+ Document(page_content=chunk, metadata=doc.metadata)
44
+ )
45
+ return chunked_documents
46
+
47
+ @staticmethod
48
+ def chunk_token(documents: list[Document], max_tokens: int=1_000) -> list[Document]:
49
+ """
50
+ Splits a list of documents into smaller chunks based on token count.
51
+
52
+ This function takes a list of Document objects and splits them into smaller chunks
53
+ using a recursive character text splitter based on tiktoken encoding. Each chunk
54
+ respects the maximum token limit while maintaining some overlap between consecutive
55
+ chunks for context preservation.
56
+
57
+ Args:
58
+ documents (list[Document]): A list of Document objects to be chunked. Each Document
59
+ should have 'page_content' (str) and 'metadata' (dict) attributes.
60
+ max_tokens (int, optional): The maximum number of tokens allowed per chunk.
61
+ Defaults to 1,000. The chunk overlap is automatically set to 10% of this value.
62
+
63
+ Returns:
64
+ list[Document]: A list of new Document objects where each document represents a chunk
65
+ of the original documents. Each chunked Document preserves the metadata from its
66
+ source document.
67
+
68
+ Note:
69
+ - Uses the "cl100k_base" tiktoken encoding (commonly used for GPT-4 and similar models)
70
+ - Chunk overlap is set to max_tokens // 10 to maintain context between chunks
71
+ - Original document metadata is preserved in all generated chunks
72
+ """
73
+ text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(encoding_name="cl100k_base",chunk_size=max_tokens, chunk_overlap=max_tokens//10)
74
+ chunked_documents = []
75
+ for doc in documents:
76
+ chunks = text_splitter.split_text(doc.page_content)
77
+ for chunk in chunks:
78
+ chunked_documents.append(
79
+ Document(page_content=chunk, metadata=doc.metadata)
80
+ )
81
+ return chunked_documents
82
+
@@ -63,6 +63,7 @@ class Chroma(VectorDBStrategy):
63
63
  CHROMA.from_documents,
64
64
  documents=batch,
65
65
  embedding=embeddings,
66
+ collection_name="default",
66
67
  persist_directory=storage_id
67
68
  )
68
69
  else:
@@ -19,6 +19,7 @@ class ShopifyParams(BaseModel):
19
19
  shop_name: str = Field(validation_alias=AliasChoices("shopName","shop_name"))
20
20
  access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
21
21
  graphql_query: Union[str, dict] = Field(validation_alias=AliasChoices("graphqlQuery","graphql_query"))
22
+ filter_handle: Optional[List[str]] = Field(default=None, validation_alias=AliasChoices("filterHandle","filter_handle"))
22
23
 
23
24
  @field_validator('graphql_query')
24
25
  @classmethod
@@ -124,8 +125,6 @@ class Shopify(IntegrationStrategy):
124
125
  cursor = page_info["endCursor"]
125
126
 
126
127
  print(f"Recuperati {len(edges)} prodotti. Totale: {len(all_data)}")
127
-
128
- # Piccola pausa per evitare di saturare l'API
129
128
  await asyncio.sleep(0.1)
130
129
 
131
130
  except aiohttp.ClientError as e:
@@ -140,4 +139,9 @@ class Shopify(IntegrationStrategy):
140
139
  raise Exception("Too many network errors. Stopping execution.")
141
140
 
142
141
  logging.info(f"Data retrieval completed! Total data: {len(all_data)}")
143
- return all_data
142
+ return self.__filter_by_handle(all_data)
143
+
144
+ def __filter_by_handle(self, data: List[dict]) -> List[dict]:
145
+ if not self.__data.filter_handle:
146
+ return data
147
+ return [item for item in data if item.get('handle') not in self.__data.filter_handle]
@@ -85,10 +85,11 @@ class DoclingLoader(BaseLoader):
85
85
  def lazy_load(self) -> Iterator[Document]:
86
86
  for source in self._file_paths:
87
87
  try:
88
- #manage only small file with header, preventing header stripping and improper chunking
88
+ #manage only small file with header, preventing header stripping and improper chunking (due to conversion in markdown table format)
89
89
  if (source.endswith('.csv') or source.endswith('.xlsx')) \
90
90
  and 'fallback' in self._kwargs \
91
- and os.path.getsize(source) > (VectorDBStrategy.MAX_TOKENS_PER_BATCH // 4): #rough token estimate
91
+ and os.path.getsize(source) > (3_000 //
92
+ (10 if source.endswith('.xlsx') else 1)): #approx 3000 chars; CSV: size ≈ chars, XLSX: compressed ~5-10x, use conservative estimate
92
93
  yield from self._fallback_loader(source)
93
94
  else:
94
95
  _result = self._converter.convert(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.103
3
+ Version: 0.0.105
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -317,7 +317,7 @@ code quality tools
317
317
 
318
318
  ```pwsh
319
319
  # .\src\robot
320
- !py -m uv pip install -U scanreq prospector[with_everything]
320
+ uv pip install -U scanreq prospector[with_everything]
321
321
  ## unused requirements
322
322
  scanreq -r requirements.txt -p ./ws_bom_robot_app
323
323
  ## style/linting
@@ -333,7 +333,7 @@ prospector ./ws_bom_robot_app -t pyroma
333
333
  #### 🧪 run tests
334
334
 
335
335
  ```pwsh
336
- !py -m uv pip install -U pytest pytest-asyncio pytest-mock pytest-cov pyclean
336
+ uv pip install -U pytest pytest-asyncio pytest-mock pytest-cov pyclean
337
337
  # clean cache if needed
338
338
  # pyclean --verbose .
339
339
  pytest --cov=ws_bom_robot_app --log-cli-level=info
@@ -1,20 +0,0 @@
1
- from langchain_core.documents import Document
2
- from langchain_text_splitters import CharacterTextSplitter
3
- import logging
4
-
5
- class DocumentChunker:
6
- _MAX_CHUNK_SIZE = 10_000
7
- @staticmethod
8
- def chunk(documents: list[Document]) -> list[Document]:
9
- text_splitter = CharacterTextSplitter(chunk_size=DocumentChunker._MAX_CHUNK_SIZE, chunk_overlap=int(DocumentChunker._MAX_CHUNK_SIZE * 0.02))
10
- chunked_documents = []
11
- for doc in documents:
12
- if len(doc.page_content) <= DocumentChunker._MAX_CHUNK_SIZE:
13
- chunked_documents.append(doc)
14
- continue
15
- chunks = text_splitter.split_text(doc.page_content)
16
- for chunk in chunks:
17
- chunked_documents.append(
18
- Document(page_content=chunk, metadata=doc.metadata)
19
- )
20
- return chunked_documents