ddi-fw 0.0.260__tar.gz → 0.0.262__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/PKG-INFO +1 -1
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/pyproject.toml +1 -1
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/langchain/__init__.py +1 -1
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/langchain/chroma_storage.py +9 -3
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/langchain/faiss_storage.py +12 -7
- ddi_fw-0.0.262/src/ddi_fw/langchain/sentence_splitter.py +17 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw.egg-info/PKG-INFO +1 -1
- ddi_fw-0.0.260/src/ddi_fw/langchain/sentence_splitter.py +0 -10
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/README.md +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/setup.cfg +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/datasets/__init__.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/datasets/core.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/datasets/db_utils.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/langchain/embeddings.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/langchain/storage.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ml/__init__.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ml/evaluation_helper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ml/ml_helper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ml/model_wrapper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ml/tracking_service.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/pipeline/__init__.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/pipeline/multi_pipeline.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/pipeline/multi_pipeline_org.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/pipeline/pipeline.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/json_helper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/kaggle.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/numpy_utils.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/package_helper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/vectorization/__init__.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw/vectorization/idf_helper.py +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw.egg-info/SOURCES.txt +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.260 → ddi_fw-0.0.262}/src/ddi_fw.egg-info/top_level.txt +0 -0
@@ -1,5 +1,5 @@
|
|
1
1
|
from ..langchain.embeddings import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
|
2
|
-
from .sentence_splitter import SentenceSplitter
|
2
|
+
from .sentence_splitter import SentenceSplitter,PassthroughTextSplitter
|
3
3
|
# from .storage import DataFrameToVectorDB, generate_embeddings
|
4
4
|
from .faiss_storage import BaseVectorStoreManager, FaissVectorStoreManager,generate_embeddings
|
5
5
|
from .chroma_storage import ChromaVectorStoreManager
|
@@ -153,8 +153,8 @@ class ChromaVectorStoreManager(BaseVectorStoreManager):
|
|
153
153
|
# print(f"{page_content_column}, size:{len(split_docs_chunk)}")
|
154
154
|
|
155
155
|
@staticmethod
|
156
|
-
def get_persist_dir(base_dir, suffix, config=None):
|
157
|
-
return f"{base_dir}"
|
156
|
+
def get_persist_dir(base_dir,id, suffix, config=None):
|
157
|
+
return f"{base_dir}/chroma_db/{id}"
|
158
158
|
|
159
159
|
def generate_vector_store(self, docs: List[Document]):
|
160
160
|
self.vector_store = Chroma(
|
@@ -267,7 +267,13 @@ class ChromaVectorStoreManager(BaseVectorStoreManager):
|
|
267
267
|
# Ensure all lists are not None and have the same length
|
268
268
|
docs = results.get('documents', []) or []
|
269
269
|
metadatas = results.get('metadatas', []) or []
|
270
|
-
embeddings = results.get('embeddings', []) or []
|
270
|
+
# embeddings = results.get('embeddings', []) or []
|
271
|
+
|
272
|
+
embeddings = results.get('embeddings')
|
273
|
+
if isinstance(embeddings, np.ndarray):
|
274
|
+
pass # Keep as-is
|
275
|
+
elif embeddings is None:
|
276
|
+
embeddings = []
|
271
277
|
|
272
278
|
# Check if all lists have the same length
|
273
279
|
if not (len(docs) == len(metadatas) == len(embeddings)):
|
@@ -14,6 +14,7 @@ from langchain_core.embeddings import Embeddings
|
|
14
14
|
from langchain_core.vectorstores import VectorStore
|
15
15
|
from ddi_fw.utils import get_import
|
16
16
|
from langchain.document_loaders import DataFrameLoader
|
17
|
+
from collections import defaultdict
|
17
18
|
|
18
19
|
class BaseVectorStoreManager(BaseModel):
|
19
20
|
embeddings: Optional[Embeddings] = None
|
@@ -38,7 +39,7 @@ class BaseVectorStoreManager(BaseModel):
|
|
38
39
|
raise NotImplementedError("This method should be implemented by subclasses.")
|
39
40
|
|
40
41
|
@staticmethod
|
41
|
-
def get_persist_dir(base_dir, suffix, config=None):
|
42
|
+
def get_persist_dir(base_dir, id ,suffix, config=None):
|
42
43
|
raise NotImplementedError("Subclasses must implement get_persist_dir.")
|
43
44
|
|
44
45
|
|
@@ -65,8 +66,8 @@ class FaissVectorStoreManager(BaseVectorStoreManager):
|
|
65
66
|
# self.vector_store.add_documents(documents=docs, ids=uuids)
|
66
67
|
|
67
68
|
@staticmethod
|
68
|
-
def get_persist_dir(base_dir, suffix, config=None):
|
69
|
-
return f"{base_dir}/{suffix}"
|
69
|
+
def get_persist_dir(base_dir,id, suffix, config=None):
|
70
|
+
return f"{base_dir}/faiss/{id}/{suffix}"
|
70
71
|
|
71
72
|
def initialize_embedding_dict(self, **kwargs):
|
72
73
|
"""
|
@@ -79,9 +80,13 @@ class FaissVectorStoreManager(BaseVectorStoreManager):
|
|
79
80
|
self.load(self.persist_directory)
|
80
81
|
# df = self.as_dataframe(formatter_fn=custom_formatter)
|
81
82
|
df = self.as_dataframe(formatter_fn=custom_formatter)
|
82
|
-
type_dict =
|
83
|
-
|
84
|
-
|
83
|
+
type_dict = defaultdict(lambda: defaultdict(list))
|
84
|
+
|
85
|
+
grouped = df.groupby(['type', 'id'])['embedding'].apply(list)
|
86
|
+
|
87
|
+
for (drug_type, id), embeddings in grouped.items():
|
88
|
+
type_dict[drug_type][id] = embeddings
|
89
|
+
|
85
90
|
return type_dict
|
86
91
|
|
87
92
|
def generate_vector_store(self, docs, handle_empty='zero'):
|
@@ -436,7 +441,7 @@ def generate_embeddings(
|
|
436
441
|
print(f"{id}_{suffix}")
|
437
442
|
# persist_dir = f'{persist_directory}/{id}/{suffix}'
|
438
443
|
# persist_dir = f'{persist_directory}/{suffix}'
|
439
|
-
persist_dir = vector_store_manager_type.get_persist_dir(persist_directory, suffix, collection_config)
|
444
|
+
persist_dir = vector_store_manager_type.get_persist_dir(persist_directory , id, suffix, collection_config)
|
440
445
|
|
441
446
|
# Prepare manager parameters
|
442
447
|
manager_params = {
|
@@ -0,0 +1,17 @@
|
|
1
|
+
from typing import List
|
2
|
+
import nltk
|
3
|
+
from nltk import sent_tokenize
|
4
|
+
from langchain_text_splitters.base import TextSplitter
|
5
|
+
|
6
|
+
nltk.download('punkt')
|
7
|
+
|
8
|
+
''' A text splitter that splits text into sentences using NLTK's sentence tokenizer.'''
|
9
|
+
class SentenceSplitter(TextSplitter):
|
10
|
+
def split_text(self, text: str) -> List[str]:
|
11
|
+
return sent_tokenize(text)
|
12
|
+
|
13
|
+
|
14
|
+
''' A text splitter that does not split the text at all, returning the entire text as a single chunk.'''
|
15
|
+
class PassthroughTextSplitter(TextSplitter):
|
16
|
+
def split_text(self, text: str) -> List[str]:
|
17
|
+
return [text]
|
@@ -1,10 +0,0 @@
|
|
1
|
-
from typing import List
|
2
|
-
import nltk
|
3
|
-
from nltk import sent_tokenize
|
4
|
-
from langchain_text_splitters.base import TextSplitter
|
5
|
-
|
6
|
-
nltk.download('punkt')
|
7
|
-
|
8
|
-
class SentenceSplitter(TextSplitter):
|
9
|
-
def split_text(self, text: str) -> List[str]:
|
10
|
-
return sent_tokenize(text)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|