ws-bom-robot-app 0.0.103__py3-none-any.whl → 0.0.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,82 @@
1
1
  from langchain_core.documents import Document
2
- from langchain_text_splitters import CharacterTextSplitter
3
- import logging
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
3
 
5
4
  class DocumentChunker:
6
- _MAX_CHUNK_SIZE = 10_000
7
5
  @staticmethod
8
6
  def chunk(documents: list[Document]) -> list[Document]:
9
- text_splitter = CharacterTextSplitter(chunk_size=DocumentChunker._MAX_CHUNK_SIZE, chunk_overlap=int(DocumentChunker._MAX_CHUNK_SIZE * 0.02))
10
- chunked_documents = []
11
- for doc in documents:
12
- if len(doc.page_content) <= DocumentChunker._MAX_CHUNK_SIZE:
13
- chunked_documents.append(doc)
14
- continue
15
- chunks = text_splitter.split_text(doc.page_content)
16
- for chunk in chunks:
17
- chunked_documents.append(
18
- Document(page_content=chunk, metadata=doc.metadata)
19
- )
20
- return chunked_documents
7
+ return DocumentChunker.chunk_recursive(documents)
8
+
9
+ @staticmethod
10
+ def chunk_recursive(documents: list[Document], chunk_size: int=3_000) -> list[Document]:
11
+ """
12
+ Recursively split documents into smaller chunks while preserving metadata.
13
+
14
+ This function takes a list of documents and splits them into smaller chunks using
15
+ RecursiveCharacterTextSplitter. Documents smaller than the chunk size are kept intact,
16
+ while larger documents are split into multiple chunks with overlapping content.
17
+
18
+ Args:
19
+ documents (list[Document]): A list of Document objects to be chunked.
20
+ chunk_size (int, optional): The maximum size of each chunk in characters.
21
+ Defaults to 3,000.
22
+
23
+ Returns:
24
+ list[Document]: A list of Document objects where each document's content is
25
+ at most chunk_size characters. Each chunk preserves the metadata from
26
+ its original document.
27
+
28
+ Notes:
29
+ - Chunk overlap is automatically set to 10% of the chunk_size to maintain
30
+ context between chunks.
31
+ - Documents smaller than or equal to chunk_size are returned unchanged.
32
+ - Metadata from the original document is copied to all resulting chunks.
33
+ """
34
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_size//10))
35
+ chunked_documents = []
36
+ for doc in documents:
37
+ if len(doc.page_content) <= chunk_size:
38
+ chunked_documents.append(doc)
39
+ continue
40
+ chunks = text_splitter.split_text(doc.page_content)
41
+ for chunk in chunks:
42
+ chunked_documents.append(
43
+ Document(page_content=chunk, metadata=doc.metadata)
44
+ )
45
+ return chunked_documents
46
+
47
+ @staticmethod
48
+ def chunk_token(documents: list[Document], max_tokens: int=1_000) -> list[Document]:
49
+ """
50
+ Splits a list of documents into smaller chunks based on token count.
51
+
52
+ This function takes a list of Document objects and splits them into smaller chunks
53
+ using a recursive character text splitter based on tiktoken encoding. Each chunk
54
+ respects the maximum token limit while maintaining some overlap between consecutive
55
+ chunks for context preservation.
56
+
57
+ Args:
58
+ documents (list[Document]): A list of Document objects to be chunked. Each Document
59
+ should have 'page_content' (str) and 'metadata' (dict) attributes.
60
+ max_tokens (int, optional): The maximum number of tokens allowed per chunk.
61
+ Defaults to 1,000. The chunk overlap is automatically set to 10% of this value.
62
+
63
+ Returns:
64
+ list[Document]: A list of new Document objects where each document represents a chunk
65
+ of the original documents. Each chunked Document preserves the metadata from its
66
+ source document.
67
+
68
+ Note:
69
+ - Uses the "cl100k_base" tiktoken encoding (commonly used for GPT-4 and similar models)
70
+ - Chunk overlap is set to max_tokens // 10 to maintain context between chunks
71
+ - Original document metadata is preserved in all generated chunks
72
+ """
73
+ text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(encoding_name="cl100k_base",chunk_size=max_tokens, chunk_overlap=max_tokens//10)
74
+ chunked_documents = []
75
+ for doc in documents:
76
+ chunks = text_splitter.split_text(doc.page_content)
77
+ for chunk in chunks:
78
+ chunked_documents.append(
79
+ Document(page_content=chunk, metadata=doc.metadata)
80
+ )
81
+ return chunked_documents
82
+
@@ -63,6 +63,7 @@ class Chroma(VectorDBStrategy):
63
63
  CHROMA.from_documents,
64
64
  documents=batch,
65
65
  embedding=embeddings,
66
+ collection_name="default",
66
67
  persist_directory=storage_id
67
68
  )
68
69
  else:
@@ -19,6 +19,7 @@ class ShopifyParams(BaseModel):
19
19
  shop_name: str = Field(validation_alias=AliasChoices("shopName","shop_name"))
20
20
  access_token: str = Field(validation_alias=AliasChoices("accessToken","access_token"))
21
21
  graphql_query: Union[str, dict] = Field(validation_alias=AliasChoices("graphqlQuery","graphql_query"))
22
+ filter_handle: Optional[List[str]] = Field(default=None, validation_alias=AliasChoices("filterHandle","filter_handle"))
22
23
 
23
24
  @field_validator('graphql_query')
24
25
  @classmethod
@@ -124,8 +125,6 @@ class Shopify(IntegrationStrategy):
124
125
  cursor = page_info["endCursor"]
125
126
 
126
127
  print(f"Recuperati {len(edges)} prodotti. Totale: {len(all_data)}")
127
-
128
- # Piccola pausa per evitare di saturare l'API
129
128
  await asyncio.sleep(0.1)
130
129
 
131
130
  except aiohttp.ClientError as e:
@@ -140,4 +139,9 @@ class Shopify(IntegrationStrategy):
140
139
  raise Exception("Too many network errors. Stopping execution.")
141
140
 
142
141
  logging.info(f"Data retrieval completed! Total data: {len(all_data)}")
143
- return all_data
142
+ return self.__filter_by_handle(all_data)
143
+
144
+ def __filter_by_handle(self, data: List[dict]) -> List[dict]:
145
+ if not self.__data.filter_handle:
146
+ return data
147
+ return [item for item in data if item.get('handle') not in self.__data.filter_handle]
@@ -85,10 +85,11 @@ class DoclingLoader(BaseLoader):
85
85
  def lazy_load(self) -> Iterator[Document]:
86
86
  for source in self._file_paths:
87
87
  try:
88
- #manage only small file with header, preventing header stripping and improper chunking
88
+ #manage only small file with header, preventing header stripping and improper chunking (due to conversion in markdown table format)
89
89
  if (source.endswith('.csv') or source.endswith('.xlsx')) \
90
90
  and 'fallback' in self._kwargs \
91
- and os.path.getsize(source) > (VectorDBStrategy.MAX_TOKENS_PER_BATCH // 4): #rough token estimate
91
+ and os.path.getsize(source) > (3_000 //
92
+ (10 if source.endswith('.xlsx') else 1)): #approx 3000 chars; CSV: size ≈ chars, XLSX: compressed ~5-10x, use conservative estimate
92
93
  yield from self._fallback_loader(source)
93
94
  else:
94
95
  _result = self._converter.convert(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.103
3
+ Version: 0.0.105
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -317,7 +317,7 @@ code quality tools
317
317
 
318
318
  ```pwsh
319
319
  # .\src\robot
320
- !py -m uv pip install -U scanreq prospector[with_everything]
320
+ uv pip install -U scanreq prospector[with_everything]
321
321
  ## unused requirements
322
322
  scanreq -r requirements.txt -p ./ws_bom_robot_app
323
323
  ## style/linting
@@ -333,7 +333,7 @@ prospector ./ws_bom_robot_app -t pyroma
333
333
  #### 🧪 run tests
334
334
 
335
335
  ```pwsh
336
- !py -m uv pip install -U pytest pytest-asyncio pytest-mock pytest-cov pyclean
336
+ uv pip install -U pytest pytest-asyncio pytest-mock pytest-cov pyclean
337
337
  # clean cache if needed
338
338
  # pyclean --verbose .
339
339
  pytest --cov=ws_bom_robot_app --log-cli-level=info
@@ -33,7 +33,7 @@ ws_bom_robot_app/llm/tools/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
33
33
  ws_bom_robot_app/llm/tools/models/main.py,sha256=1hICqHs-KS2heenkH7b2eH0N2GrPaaNGBrn64cl_A40,827
34
34
  ws_bom_robot_app/llm/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  ws_bom_robot_app/llm/utils/agent.py,sha256=uFuSfYMfGIE2WCKGNSKL-T2SDFn-tUKvbAYbGTPIw6g,1445
36
- ws_bom_robot_app/llm/utils/chunker.py,sha256=zVXjRMloc3KbNEqiDcycYzy4N0Ey1g8XYeq6ftyvkyg,857
36
+ ws_bom_robot_app/llm/utils/chunker.py,sha256=u0l2t3bIQihOpLRlcrO23bNBda3kgzayyGAIR0YZUqQ,4069
37
37
  ws_bom_robot_app/llm/utils/cleanup.py,sha256=ARLZTX4mLbkLCEnMdIWYDYEAPOjzfy1laLGkYnxZe30,3063
38
38
  ws_bom_robot_app/llm/utils/cms.py,sha256=gfIXvY3DxgbgDf0LCzyekWitaduxKGLHfV6gbRmh8zk,6960
39
39
  ws_bom_robot_app/llm/utils/download.py,sha256=rvc88E63UGHnFVlJJeMb05Z2FcBYIITqKnIE3ldEu6I,7293
@@ -44,7 +44,7 @@ ws_bom_robot_app/llm/vector_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
44
44
  ws_bom_robot_app/llm/vector_store/generator.py,sha256=W_hi_UOPaSjnEuazhUFIrMAwTvz64Du8_gpiVAxFlVc,6451
45
45
  ws_bom_robot_app/llm/vector_store/db/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
46
  ws_bom_robot_app/llm/vector_store/db/base.py,sha256=pIlHTg83bLdGfbZorilSqeJ5QKgpGU8fxF3c-5pLWJo,8490
47
- ws_bom_robot_app/llm/vector_store/db/chroma.py,sha256=s_RH16do52_5ejWgjlzp2cGTonGFIkalp0V3L3gbTnU,4574
47
+ ws_bom_robot_app/llm/vector_store/db/chroma.py,sha256=oe0p_OlqTGFXEvtC8AXiElUWnfHXTXk97_suYh7kirU,4622
48
48
  ws_bom_robot_app/llm/vector_store/db/faiss.py,sha256=rCMq_dhg1-NM8G5L_VEdDIvgmkWLXL3r5EreaqxR3Oc,3925
49
49
  ws_bom_robot_app/llm/vector_store/db/manager.py,sha256=5rqBvc0QKmHFUgVHqBAr1Y4FZRl-w-ylGMjgXZywrdA,533
50
50
  ws_bom_robot_app/llm/vector_store/db/qdrant.py,sha256=-36YOXjNtDeWveREnGd1SZF3hT7_Peg_pAT5uoxXcQU,3237
@@ -62,15 +62,15 @@ ws_bom_robot_app/llm/vector_store/integration/manager.py,sha256=K_Ymfb4xqm33g7gy
62
62
  ws_bom_robot_app/llm/vector_store/integration/s3.py,sha256=_SAuPfyK7lIz7Jq1LiBavkF1lre5yqe6DGlMYnxMa4o,3317
63
63
  ws_bom_robot_app/llm/vector_store/integration/sftp.py,sha256=g6f-FKkEktx7nJahb7RKyQ4pM9wGik0_xXMDfWup-1c,2845
64
64
  ws_bom_robot_app/llm/vector_store/integration/sharepoint.py,sha256=DhBcAwgr1u-dQ_8TxeLPu7kzr_EDogCRQeBrIULtWfo,4898
65
- ws_bom_robot_app/llm/vector_store/integration/shopify.py,sha256=Q0W3rRV-3xox303KhSiiIxTJNIXIbMZiF7yME8dW-FE,5485
65
+ ws_bom_robot_app/llm/vector_store/integration/shopify.py,sha256=_XlJrN9diW5WrLhNwnnEzFOgvWbjxQJZ54hZ2vKtgxM,5788
66
66
  ws_bom_robot_app/llm/vector_store/integration/sitemap.py,sha256=YKQ_0VUSW9NQ3svVKuas2OLk_fsTQuxg4B_zCBzKx_s,5282
67
67
  ws_bom_robot_app/llm/vector_store/integration/slack.py,sha256=hiE1kkg7868mbP2wVWQLmC1fK2jIE1lT7f8hVN0NqeY,2636
68
68
  ws_bom_robot_app/llm/vector_store/integration/thron.py,sha256=6XefkQxS-qF4yAH_sH1n2EONZvTiWiAAx_bb24y8QEQ,9330
69
69
  ws_bom_robot_app/llm/vector_store/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
70
  ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=InpRwKPxp0tuM4drezBvxxAWHe3XTmu60MGvFsT7RPE,7176
71
- ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=dLvpOi4EH0jyx06IrHoanfLRPFXLJi9BU2BWYcaw-U4,5000
71
+ ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=RFYSZkZAYtU8wJSd1rN2T0lVo-wK1-ddtr6bH2fBr6Q,5170
72
72
  ws_bom_robot_app/llm/vector_store/loader/json_loader.py,sha256=LDppW0ZATo4_1hh-KlsAM3TLawBvwBxva_a7k5Oz1sc,858
73
- ws_bom_robot_app-0.0.103.dist-info/METADATA,sha256=tGnPkamyrowzbXt61Lrh6KtEkTK6nccaQqgLz6zq9rM,11025
74
- ws_bom_robot_app-0.0.103.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
75
- ws_bom_robot_app-0.0.103.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
76
- ws_bom_robot_app-0.0.103.dist-info/RECORD,,
73
+ ws_bom_robot_app-0.0.105.dist-info/METADATA,sha256=dOAkYLVKxkr3-FcVVFK475-jH80yd_s3DQSB4bLnl0c,11011
74
+ ws_bom_robot_app-0.0.105.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
75
+ ws_bom_robot_app-0.0.105.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
76
+ ws_bom_robot_app-0.0.105.dist-info/RECORD,,