ws-bom-robot-app 0.0.73__py3-none-any.whl → 0.0.74__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ from langchain_core.language_models import BaseChatModel
7
7
  from langchain_core.vectorstores.base import VectorStoreRetriever, VectorStore
8
8
  from langchain.retrievers import SelfQueryRetriever
9
9
  from langchain.chains.query_constructor.schema import AttributeInfo
10
+ import tiktoken
10
11
 
11
12
  class VectorDBStrategy(ABC):
12
13
  class VectorDBStrategy:
@@ -49,6 +50,52 @@ class VectorDBStrategy(ABC):
49
50
  Asynchronously invokes multiple retrievers in parallel, then merges
50
51
  their results while removing duplicates.
51
52
  """
53
+ def __init__(self):
54
+ self.max_tokens_per_batch = 300_000 * 0.8 # conservative limit below 300k openai limit: https://platform.openai.com/docs/api-reference/embeddings/create
55
+ try:
56
+ self.encoding = tiktoken.get_encoding("cl100k_base") # text-embedding-3-small, text-embedding-3-large: https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
57
+ except Exception:
58
+ self.encoding = None
59
+
60
+ def _count_tokens(self, text: str) -> int:
61
+ """Count tokens in text using tiktoken or fallback estimation"""
62
+ if self.encoding:
63
+ try:
64
+ return len(self.encoding.encode(text))
65
+ except Exception:
66
+ pass
67
+ # fallback: rough estimation (1 token ≈ 4 characters)
68
+ return len(text) // 4
69
+
70
+ def _batch_documents_by_tokens(self, documents: list[Document]) -> list[list[Document]]:
71
+ """Split documents into batches based on token count"""
72
+ if not documents:
73
+ return []
74
+ batches = []
75
+ current_batch = []
76
+ current_token_count = 0
77
+
78
+ for doc in documents:
79
+ doc_tokens = self._count_tokens(doc.page_content)
80
+ # check if adding this document exceeds the limit
81
+ if current_token_count + doc_tokens > self.max_tokens_per_batch:
82
+ # start new batch if current batch is not empty
83
+ if current_batch:
84
+ batches.append(current_batch)
85
+ # reset current batch
86
+ current_batch = [doc]
87
+ current_token_count = doc_tokens # reset to current doc's tokens
88
+ else:
89
+ # add to current batch
90
+ current_batch.append(doc)
91
+ current_token_count += doc_tokens
92
+
93
+ # add final batch if not empty
94
+ if current_batch:
95
+ batches.append(current_batch)
96
+
97
+ return batches
98
+
52
99
  _CACHE: dict[str, VectorStore] = {}
53
100
  def _clear_cache(self, key: str):
54
101
  if key in self._CACHE:
@@ -38,6 +38,9 @@ class Chroma(VectorDBStrategy):
38
38
  Returns:
39
39
  CHROMA: The retrieved or newly created Chroma instance.
40
40
  """
41
+ def __init__(self):
42
+ super().__init__()
43
+
41
44
  async def create(
42
45
  self,
43
46
  embeddings: Embeddings,
@@ -47,19 +50,35 @@ class Chroma(VectorDBStrategy):
47
50
  ) -> Optional[str]:
48
51
  try:
49
52
  chunked_docs = DocumentChunker.chunk(documents)
50
- await asyncio.to_thread(
51
- CHROMA.from_documents,
52
- documents=chunked_docs,
53
- embedding=embeddings,
54
- persist_directory=storage_id
55
- )
56
- self._clear_cache(storage_id)
53
+ batches = self._batch_documents_by_tokens(chunked_docs)
54
+ logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
55
+ _instance: CHROMA = None
56
+ for i, batch in enumerate(batches):
57
+ batch_tokens = sum(self._count_tokens(doc.page_content) for doc in batch)
58
+ logging.info(f"processing batch {i+1}/{len(batches)} with {len(batch)} docs ({batch_tokens:,} tokens)")
59
+ # create instance from first batch
60
+ if _instance is None:
61
+ _instance = await asyncio.to_thread(
62
+ CHROMA.from_documents,
63
+ documents=batch,
64
+ embedding=embeddings,
65
+ persist_directory=storage_id
66
+ )
67
+ else:
68
+ # merge to existing instance
69
+ await _instance.aadd_documents(batch)
70
+ # add a small delay to avoid rate limiting
71
+ if i < len(batches) - 1: # except last batch
72
+ await asyncio.sleep(1)
73
+ if _instance:
74
+ self._clear_cache(storage_id)
75
+ logging.info(f"Successfully created {Chroma.__name__} index with {len(chunked_docs)} total documents")
57
76
  return storage_id
58
77
  except Exception as e:
59
78
  logging.error(f"{Chroma.__name__} create error: {e}")
60
79
  raise e
61
80
  finally:
62
- del documents
81
+ del documents, chunked_docs, _instance
63
82
  gc.collect()
64
83
 
65
84
  def get_loader(
@@ -22,6 +22,9 @@ class Faiss(VectorDBStrategy):
22
22
  was previously loaded and cached, it returns the cached instance; otherwise,
23
23
  it loads the index from local storage and caches it for subsequent use.
24
24
  """
25
+ def __init__(self):
26
+ super().__init__()
27
+
25
28
  async def create(
26
29
  self,
27
30
  embeddings: Embeddings,
@@ -31,19 +34,42 @@ class Faiss(VectorDBStrategy):
31
34
  ) -> Optional[str]:
32
35
  try:
33
36
  chunked_docs = DocumentChunker.chunk(documents)
34
- _instance = await asyncio.to_thread(
35
- FAISS.from_documents,
36
- chunked_docs,
37
- embeddings
38
- )
39
- await asyncio.to_thread(_instance.save_local, storage_id)
40
- self._clear_cache(storage_id)
37
+ batches = self._batch_documents_by_tokens(chunked_docs)
38
+ logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
39
+ _instance: FAISS = None
40
+ for i, batch in enumerate(batches):
41
+ batch_tokens = sum(self._count_tokens(doc.page_content) for doc in batch)
42
+ logging.info(f"processing batch {i+1}/{len(batches)} with {len(batch)} docs ({batch_tokens:,} tokens)")
43
+ # init
44
+ _batch_instance = await asyncio.to_thread(
45
+ FAISS.from_documents,
46
+ batch,
47
+ embeddings
48
+ )
49
+ # create instance from first batch
50
+ if _instance is None:
51
+ _instance = _batch_instance
52
+ else:
53
+ # merge to existing instance
54
+ await asyncio.to_thread(
55
+ _instance.merge_from,
56
+ _batch_instance
57
+ )
58
+ del _batch_instance
59
+ gc.collect()
60
+ # add a small delay to avoid rate limiting
61
+ if i < len(batches) - 1: # except last batch
62
+ await asyncio.sleep(1)
63
+ if _instance:
64
+ await asyncio.to_thread(_instance.save_local, storage_id)
65
+ self._clear_cache(storage_id)
66
+ logging.info(f"Successfully created {Faiss.__name__} index with {len(chunked_docs)} total documents")
41
67
  return storage_id
42
68
  except Exception as e:
43
69
  logging.error(f"{Faiss.__name__} create error: {e}")
44
70
  raise e
45
71
  finally:
46
- del documents, _instance
72
+ del documents, chunked_docs, _instance
47
73
  gc.collect()
48
74
 
49
75
  def get_loader(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ws_bom_robot_app
3
- Version: 0.0.73
3
+ Version: 0.0.74
4
4
  Summary: A FastAPI application serving ws bom/robot/llm platform ai.
5
5
  Home-page: https://github.com/websolutespa/bom
6
6
  Author: Websolute Spa
@@ -19,7 +19,7 @@ Requires-Dist: fastapi[standard]==0.115.14
19
19
  Requires-Dist: chevron==0.14.0
20
20
  Requires-Dist: langchain==0.3.26
21
21
  Requires-Dist: langchain-community==0.3.26
22
- Requires-Dist: langchain-core==0.3.67
22
+ Requires-Dist: langchain-core==0.3.72
23
23
  Requires-Dist: langchain-openai==0.3.27
24
24
  Requires-Dist: langchain-anthropic==0.3.6
25
25
  Requires-Dist: langchain-ibm==0.3.14
@@ -28,8 +28,8 @@ Requires-Dist: langchain-google-vertexai==2.0.27
28
28
  Requires-Dist: langchain-groq==0.3.5
29
29
  Requires-Dist: langchain-ollama==0.3.3
30
30
  Requires-Dist: faiss-cpu==1.11.0
31
- Requires-Dist: chromadb==1.0.13
32
- Requires-Dist: langchain_chroma==0.2.4
31
+ Requires-Dist: chromadb==1.0.15
32
+ Requires-Dist: langchain_chroma==0.2.5
33
33
  Requires-Dist: fastembed==0.7.1
34
34
  Requires-Dist: langchain-qdrant==0.2.0
35
35
  Requires-Dist: qdrant-client==1.15.0
@@ -42,9 +42,9 @@ ws_bom_robot_app/llm/utils/webhooks.py,sha256=LAAZqyN6VhV13wu4X-X85TwdDgAV2rNvIw
42
42
  ws_bom_robot_app/llm/vector_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
43
  ws_bom_robot_app/llm/vector_store/generator.py,sha256=9_xdtCKJhmt1OP0GXDjvFERXMP7ozLZT92KuYEBDgC0,6314
44
44
  ws_bom_robot_app/llm/vector_store/db/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
- ws_bom_robot_app/llm/vector_store/db/base.py,sha256=rNIYHPDXhVyoP9AJKRbGT5Vh5HzcKYx8MUIhEuCVGW4,6491
46
- ws_bom_robot_app/llm/vector_store/db/chroma.py,sha256=3UXR7PZidFxgI5jlC0WWPAJ0NGRI2AqSBVlL9VZOJgw,3356
47
- ws_bom_robot_app/llm/vector_store/db/faiss.py,sha256=aKj8EbM6VU5FLBvVQDz4c2aihvY1O3LiVIjzzxGmehw,2492
45
+ ws_bom_robot_app/llm/vector_store/db/base.py,sha256=t0Z1VCcg604evEzJENGNqYFBi_AZLTEUzmxA5wgoE_A,8419
46
+ ws_bom_robot_app/llm/vector_store/db/chroma.py,sha256=2riMQvwe2T99X_NtO9yO9lpZ0zj2Nb06l9Hb1lWJ00E,4509
47
+ ws_bom_robot_app/llm/vector_store/db/faiss.py,sha256=Y2LpMsU0Ce2RCaGM1n69BxMpXWXpBoj1T5aAAJpX2qE,3860
48
48
  ws_bom_robot_app/llm/vector_store/db/manager.py,sha256=5rqBvc0QKmHFUgVHqBAr1Y4FZRl-w-ylGMjgXZywrdA,533
49
49
  ws_bom_robot_app/llm/vector_store/db/qdrant.py,sha256=HfEtFqMF0wIn5SNbst6glw7gG4nYEgSF3S-4RjTaM6g,2068
50
50
  ws_bom_robot_app/llm/vector_store/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -67,7 +67,7 @@ ws_bom_robot_app/llm/vector_store/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5
67
67
  ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=L_ugekNuAq0N9O-24wtlHSNHkqSeD-KsJrfGt_FX9Oc,5340
68
68
  ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=yP0zgXLeFAlByaYuj-6cYariuknckrFds0dxdRcnVz8,3456
69
69
  ws_bom_robot_app/llm/vector_store/loader/json_loader.py,sha256=qo9ejRZyKv_k6jnGgXnu1W5uqsMMtgqK_uvPpZQ0p74,833
70
- ws_bom_robot_app-0.0.73.dist-info/METADATA,sha256=dBHcbQv5RaJypA5WcIR_2zZuWyw6IQWevnUpPoBVlFw,8609
71
- ws_bom_robot_app-0.0.73.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
72
- ws_bom_robot_app-0.0.73.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
73
- ws_bom_robot_app-0.0.73.dist-info/RECORD,,
70
+ ws_bom_robot_app-0.0.74.dist-info/METADATA,sha256=yoc6qsnTaKCpOXJjc1yCWrKtnEE5vqKvx_CyxQm2s08,8609
71
+ ws_bom_robot_app-0.0.74.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
72
+ ws_bom_robot_app-0.0.74.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
73
+ ws_bom_robot_app-0.0.74.dist-info/RECORD,,