ddi-fw 0.0.157__py3-none-any.whl → 0.0.159__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/__init__.py +0 -3
- ddi_fw/datasets/core.py +14 -2
- ddi_fw/datasets/ddi_mdl/base.py +31 -29
- ddi_fw/langchain/embeddings.py +4 -29
- ddi_fw/pipeline/ner_pipeline.py +1 -1
- ddi_fw/vectorization/__init__.py +2 -0
- {ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/METADATA +5 -13
- {ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/RECORD +12 -12
- ddi_fw/datasets/embedding_generator.py +0 -67
- /ddi_fw/{datasets → vectorization}/feature_vector_generation.py +0 -0
- /ddi_fw/{datasets → vectorization}/idf_helper.py +0 -0
- {ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/__init__.py
CHANGED
```diff
@@ -2,9 +2,6 @@ from .core import BaseDataset
 from .ddi_mdl.base import DDIMDLDataset
 from .ddi_mdl_text.base import DDIMDLDatasetV2
 from .mdf_sa_ddi.base import MDFSADDIDataset
-from .embedding_generator import create_embeddings
-from .idf_helper import IDF
-from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
 from .dataset_splitter import DatasetSplitter
 __all__ = ['BaseDataset','DDIMDLDataset','MDFSADDIDataset']
 
```
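The net effect: `create_embeddings` is gone (its module is deleted at the bottom of this diff), and `IDF`, `SimilarityMatrixGenerator`, and `VectorGenerator` are no longer re-exported from `ddi_fw.datasets`; they now live in the new `ddi_fw.vectorization` package. A migration sketch for downstream code — the import targets are taken from this diff, the grouping is illustrative:

```python
# Before (<= 0.0.157) these were re-exported from ddi_fw.datasets:
# from ddi_fw.datasets import IDF, SimilarityMatrixGenerator, VectorGenerator

# After (0.0.159) the vectorization helpers moved:
from ddi_fw.vectorization import IDF, SimilarityMatrixGenerator, VectorGenerator
from ddi_fw.datasets import BaseDataset, DDIMDLDataset, MDFSADDIDataset
```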
ddi_fw/datasets/core.py
CHANGED
```diff
@@ -4,11 +4,23 @@ import numpy as np
 import pandas as pd
 from pydantic import BaseModel, Field, computed_field
 from ddi_fw.datasets.dataset_splitter import DatasetSplitter
-from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
-from ddi_fw.langchain.embeddings import PoolingStrategy
 from ddi_fw.utils.utils import create_folder_if_not_exists
 
 
+try:
+    from ddi_fw.vectorization import SimilarityMatrixGenerator, VectorGenerator
+except ImportError:
+    raise ImportError(
+        "Failed to import vectorization module. Ensure that the module exists and is correctly installed. ")
+
+try:
+    from ddi_fw.langchain.embeddings import PoolingStrategy
+except ImportError:
+    raise ImportError(
+        "Failed to import langchain.embeddings module. ")
+
+
+
 def stack(df_column):
     return np.stack(df_column.values)
 
```
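The module now fails fast with a clearer message when an optional subpackage is missing. A minimal reproduction of the guard pattern; note that chaining with `from e` is an addition of this sketch (it preserves the original traceback) and is not used in the package itself:

```python
# Sketch of the import-guard pattern introduced above, with exception chaining.
try:
    from ddi_fw.vectorization import SimilarityMatrixGenerator, VectorGenerator
except ImportError as e:
    raise ImportError(
        "Failed to import vectorization module. "
        "Ensure that the module exists and is correctly installed.") from e
```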
ddi_fw/datasets/ddi_mdl/base.py
CHANGED
```diff
@@ -1,22 +1,21 @@
-import glob
 import pathlib
 from typing import List, Optional, Tuple
-from ddi_fw.datasets.core import
-from ddi_fw.datasets.dataset_splitter import DatasetSplitter
+from ddi_fw.datasets.core import TextDatasetMixin, generate_sim_matrices_new, generate_vectors
 from ddi_fw.datasets.db_utils import create_connection
-from ddi_fw.datasets.idf_helper import IDF
-from ddi_fw.utils.utils import create_folder_if_not_exists
 import numpy as np
 import pandas as pd
 from pydantic import BaseModel, Field, model_validator, root_validator
-from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator,VectorGenerator
-from ddi_fw.langchain.embeddings import PoolingStrategy
 from abc import ABC, abstractmethod
 from sklearn.preprocessing import LabelBinarizer
-
-from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
 import logging
 
+
+try:
+    from ddi_fw.vectorization import IDF
+except ImportError:
+    raise ImportError(
+        "Failed to import vectorization module. Ensure that the module exists and is correctly installed. ")
+
 logger = logging.getLogger(__name__)
 
 # Constants for embedding, chemical properties, and NER columns
```
```diff
@@ -32,14 +31,15 @@ LIST_OF_NER_COLUMNS = ['tui', 'cui', 'entities']
 
 HERE = pathlib.Path(__file__).resolve().parent
 
+
 class DDIMDLDataset(TextDatasetMixin):
     dataset_name: str = "DDIMDLDataset"
     index_path: str = Field(default_factory=lambda: str(
         pathlib.Path(__file__).resolve().parent.joinpath('indexes')))
     # drugs_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
     # ddis_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
-    drugs_df: Optional[pd.DataFrame] = None
-    ddis_df: Optional[pd.DataFrame] = None
+    drugs_df: Optional[pd.DataFrame] = None
+    ddis_df: Optional[pd.DataFrame] = None
 
     chemical_property_columns: list[str] = Field(
         default_factory=lambda: LIST_OF_CHEMICAL_PROPERTY_COLUMNS)
```
```diff
@@ -50,8 +50,8 @@ class DDIMDLDataset(TextDatasetMixin):
     cui_threshold: float | None = None
     entities_threshold: float | None = None
 
-
     # @model_validator
+
     def validate_columns(self, values):
         if not set(values['chemical_property_columns']).issubset(LIST_OF_CHEMICAL_PROPERTY_COLUMNS):
             raise ValueError("Invalid chemical property columns")
```
```diff
@@ -64,9 +64,10 @@ class DDIMDLDataset(TextDatasetMixin):
         super().__init__(**kwargs)
         self.class_column = 'event_category'
         _db_path = HERE.joinpath('data/event.db')
-
+
         self.__similarity_related_columns__ = []
-        self.__similarity_related_columns__.extend(
+        self.__similarity_related_columns__.extend(
+            self.chemical_property_columns)
         self.__similarity_related_columns__.extend(self.ner_columns)
         # TODO with resource
         self._conn = create_connection(_db_path.absolute().as_posix())
```
```diff
@@ -112,9 +113,9 @@ class DDIMDLDataset(TextDatasetMixin):
     def prep(self):
         if self.drugs_df is None or self.ddis_df is None:
             raise Exception("There is no data")
-
+
         drug_ids = self.drugs_df['id'].to_list()
-
+
         filtered_df = self.drugs_df
         combined_df = filtered_df.copy()
 
```
```diff
@@ -140,11 +141,12 @@ class DDIMDLDataset(TextDatasetMixin):
             if key.startswith('entities'):
                 threshold = self.entities_threshold
             combined_df[key] = filtered_ner_df[key]
-            valid_codes = idf_scores_df[idf_scores_df[key]
+            valid_codes = idf_scores_df[idf_scores_df[key]
+                                        > threshold].index
 
             # print(f'{key}: valid code size = {len(valid_codes)}')
             combined_df[key] = combined_df[key].apply(lambda items:
-
+                                                      [item for item in items if item in valid_codes])
 
         moved_columns = ['id']
         moved_columns.extend(self.__similarity_related_columns__)
```
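The reflowed lines above implement IDF-threshold filtering of NER codes: codes whose IDF score does not exceed the per-column threshold are dropped from each row's list. A standalone sketch of the same logic, with invented data:

```python
import pandas as pd

# Keep only the NER codes whose IDF score exceeds the threshold.
# All values here are invented for illustration.
idf_scores_df = pd.DataFrame({'cui': [0.2, 1.5, 3.0]}, index=['C1', 'C2', 'C3'])
combined = pd.Series([['C1', 'C2'], ['C3'], ['C1']])
threshold = 1.0

valid_codes = idf_scores_df[idf_scores_df['cui'] > threshold].index
filtered = combined.apply(lambda items: [item for item in items if item in valid_codes])
print(filtered.tolist())  # [['C2'], ['C3'], []]
```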
```diff
@@ -153,28 +155,29 @@ class DDIMDLDataset(TextDatasetMixin):
         chemical_properties_df = chemical_properties_df.fillna("").apply(list)
 
         # generate vectors; the dictionary will contain ndarrays
-        generated_vectors = generate_vectors(
+        generated_vectors = generate_vectors(
+            chemical_properties_df, self.__similarity_related_columns__)
 
         similarity_matrices = generate_sim_matrices_new(
-            chemical_properties_df,generated_vectors, self.__similarity_related_columns__, key_column=
+            chemical_properties_df, generated_vectors, self.__similarity_related_columns__, key_column="id")
 
         event_categories = self.ddis_df['event_category']
         labels = event_categories.tolist()
         lb = LabelBinarizer()
         lb.fit(labels)
         classes = lb.transform(labels)
-
+
         def similarity_lambda_fnc(row, value):
             if row['id1'] in value:
                 return value[row['id1']]
 
-        def lambda_fnc(row: pd.Series, value)-> Optional[np.float16]:
+        def lambda_fnc(row: pd.Series, value) -> Optional[np.float16]:
             if row['id1'] in value and row['id2'] in value:
                 return np.float16(np.hstack(
                     (value[row['id1']], value[row['id2']])))
             return None
-
-
+            # return np.hstack(
+            #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
 
         def x_fnc(row, embeddings_after_pooling):
             if row['id1'] in embeddings_after_pooling:
```
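Two steps in this hunk are worth unpacking: `LabelBinarizer` one-hot encodes the event categories, and `lambda_fnc` builds a pair feature by concatenating the two drugs' vectors and downcasting to `float16`. A self-contained sketch with invented labels and vectors:

```python
import numpy as np
from sklearn.preprocessing import LabelBinarizer

# One-hot encode event categories (invented labels; note that with exactly
# two classes LabelBinarizer emits a single 0/1 column instead of two).
labels = ['increase', 'decrease', 'increase', 'no_effect']
lb = LabelBinarizer()
classes = lb.fit_transform(labels)

# Pair feature for a drug pair, mirroring lambda_fnc above (vectors invented).
value = {'D1': np.ones(4), 'D2': np.zeros(4)}
row = {'id1': 'D1', 'id2': 'D2'}
pair = np.float16(np.hstack((value[row['id1']], value[row['id2']])))
print(classes.shape, pair.shape, pair.dtype)  # (4, 3) (8,) float16
```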
```diff
@@ -207,13 +210,12 @@ class DDIMDLDataset(TextDatasetMixin):
 
         dataframe = self.ddis_df.copy()
         if not isinstance(classes, (list, pd.Series, np.ndarray)):
-
+            raise TypeError(
+                "classes must be an iterable (list, Series, or ndarray)")
 
         if len(classes) != len(dataframe):
-            raise ValueError(
+            raise ValueError(
+                "Length of classes must match the number of rows in the DataFrame")
 
         dataframe['class'] = list(classes)
         self.set_dataframe(dataframe)
-
-
-
```
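The rewritten tail of `prep()` now validates `classes` before attaching it to the DDI dataframe. A compact sketch of the contract, with invented data:

```python
import numpy as np
import pandas as pd

# Invented data illustrating the validation added above.
dataframe = pd.DataFrame({'id1': ['D1', 'D2'], 'id2': ['D3', 'D4']})
classes = np.array([[1, 0], [0, 1]])

if not isinstance(classes, (list, pd.Series, np.ndarray)):
    raise TypeError("classes must be an iterable (list, Series, or ndarray)")
if len(classes) != len(dataframe):
    raise ValueError("Length of classes must match the number of rows in the DataFrame")
dataframe['class'] = list(classes)  # one label vector per DDI row
```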
ddi_fw/langchain/embeddings.py
CHANGED
```diff
@@ -1,36 +1,11 @@
-# !pip install -U sentence-transformers
-
-# from transformers import BertTokenizer,BertForPreTraining,BertModel
-# from sentence_transformers import SentenceTransformer, util
-import pandas as pd
 import numpy as np
-from nltk import sent_tokenize
-import torch
-from tqdm import tqdm
-
-
-from collections import defaultdict
-from functools import partial
-from abc import ABC, abstractmethod
 from transformers import AutoModel, AutoTokenizer
-from sentence_transformers import SentenceTransformer
-
-from typing import Any, Dict, List, Optional
+from sentence_transformers import SentenceTransformer
+from typing import Any, List
 from langchain_core.embeddings import Embeddings
-from pydantic import BaseModel, ConfigDict,
+from pydantic import BaseModel, ConfigDict, computed_field
 from langchain.embeddings import SentenceTransformerEmbeddings
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import Chroma
-import chromadb
-
-
-
-# def split_docs(documents, chunk_size=1000, chunk_overlap=20):
-#     text_splitter = RecursiveCharacterTextSplitter(
-#         chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-#     docs = text_splitter.split_documents(documents)
-#     return docs
+
 
 class PoolingStrategy():
     def __init__(self):
```
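Net effect: the module no longer drags in `torch`, `nltk`, `tqdm`, `chromadb`, or the text-splitter machinery at import time; only the embedding-related imports survive. A usage sketch of the retained wrapper, mirroring the package's own `langchain.embeddings` import; the model name is an assumption for illustration, not a package default:

```python
from langchain.embeddings import SentenceTransformerEmbeddings

# "all-MiniLM-L6-v2" is an assumed model; the package pins no model.
embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectors = embedder.embed_documents(["aspirin", "warfarin"])
print(len(vectors), len(vectors[0]))  # 2 documents, one vector each
```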
ddi_fw/pipeline/ner_pipeline.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 from collections import defaultdict
 import numpy as np
 from ddi_fw.datasets.core import BaseDataset
-from ddi_fw.
+from ddi_fw.vectorization.idf_helper import IDF
 from typing import Any, Dict, List, Optional
 from itertools import product
 
```
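The old import (shown truncated in this diff) pointed into `ddi_fw.datasets`; 0.0.159 targets the relocated `ddi_fw.vectorization.idf_helper`. Code that must run against both versions could guard the import — a sketch, not from the package; the fallback path is taken from the 0.0.157 RECORD:

```python
# Version-tolerant import of IDF across the module move.
try:
    from ddi_fw.vectorization.idf_helper import IDF   # 0.0.159+
except ImportError:
    from ddi_fw.datasets.idf_helper import IDF        # <= 0.0.157
```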
{ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.157
+Version: 0.0.159
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
```
```diff
@@ -8,7 +8,6 @@ Keywords: Machine Learning
 Classifier: Development Status :: 1 - Planning
 Classifier: Environment :: Console
 Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Framework :: Pytest
 Classifier: Framework :: tox
```
```diff
@@ -22,8 +21,10 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+Requires-Dist: pydantic==2.10.6
 Requires-Dist: tqdm>=4.66.6
 Requires-Dist: pandas>=2.2.0
+Requires-Dist: py7zr==0.22.0
 Provides-Extra: llm
 Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1; extra == "llm"
 Requires-Dist: transformers>=4.42.4; extra == "llm"
```
```diff
@@ -32,6 +33,7 @@ Requires-Dist: tokenizers>=0.19.1; extra == "llm"
 Requires-Dist: openai>=1.52.2; extra == "llm"
 Requires-Dist: langchain>=0.3.4; extra == "llm"
 Requires-Dist: langchain_community==0.3.3; extra == "llm"
+Requires-Dist: chromadb>=0.5.15; extra == "llm"
 Provides-Extra: ml
 Requires-Dist: scikit-learn==1.5.2; extra == "ml"
 Requires-Dist: tensorflow<2.18.0,>=2.17.0; extra == "ml"
```
```diff
@@ -43,17 +45,15 @@ Requires-Dist: scipy==1.13.1; extra == "ml"
 Requires-Dist: pandas>=2.2.0; extra == "ml"
 Requires-Dist: plotly==5.24.1; extra == "ml"
 Requires-Dist: matplotlib==3.8.0; extra == "ml"
-Requires-Dist: rdkit==2023.3.3; extra == "ml"
 Requires-Dist: datasets==3.0.2; extra == "ml"
 Requires-Dist: tqdm>=4.66.6; extra == "ml"
 Provides-Extra: datasets
 Requires-Dist: datasets==3.0.2; extra == "datasets"
 Requires-Dist: unstructured==0.16.3; extra == "datasets"
-Requires-Dist: py7zr==0.22.0; extra == "datasets"
 Requires-Dist: xmlschema==3.4.2; extra == "datasets"
 Provides-Extra: drugbank
 Requires-Dist: rdkit==2023.3.3; extra == "drugbank"
-Requires-Dist:
+Requires-Dist: xmlschema==3.4.2; extra == "drugbank"
 Provides-Extra: pipeline-and-ner
 Requires-Dist: nltk>=3.8.1; extra == "pipeline-and-ner"
 Requires-Dist: stanza==1.9.2; extra == "pipeline-and-ner"
```
```diff
@@ -61,16 +61,8 @@ Requires-Dist: transformers>=4.42.4; extra == "pipeline-and-ner"
 Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1; extra == "pipeline-and-ner"
 Requires-Dist: mlflow==2.16.1; extra == "pipeline-and-ner"
 Provides-Extra: utils
-Requires-Dist: pydantic==2.10.6; extra == "utils"
 Requires-Dist: python-stopwatch==1.1.11; extra == "utils"
 Requires-Dist: importlib-resources==6.4.5; extra == "utils"
 Requires-Dist: lxml==5.3.0; extra == "utils"
 Requires-Dist: pyarrow==17.0.0; extra == "utils"
 Requires-Dist: pycryptodomex==3.22.0; extra == "utils"
-Requires-Dist: pydantic-settings-2.8.1; extra == "utils"
-Requires-Dist: python-dotenv-1.1.0; extra == "utils"
-Requires-Dist: python-iso639-2025.2.18; extra == "utils"
-Requires-Dist: python-magic-0.4.27; extra == "utils"
-Requires-Dist: pyzstd==0.16.2; extra == "utils"
-Requires-Dist: databricks-sdk-0.47.0; extra == "utils"
-Requires-Dist: python-tml-1.0.2; extra == "utils"
```
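For consumers, the notable packaging changes are: `pydantic==2.10.6` and `py7zr==0.22.0` moved from extras into the core requirements, the `llm` extra now also pulls in `chromadb>=0.5.15`, the apparently malformed `utils` pins (missing `==`, e.g. `pydantic-settings-2.8.1`) were dropped, the duplicate `rdkit` pin left the `ml` extra (it remains under `drugbank`), and the MIT license classifier was removed. Installing with the heavier extras would presumably look like `pip install "ddi-fw[llm]==0.0.159"` or `pip install "ddi-fw[ml]==0.0.159"`.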
{ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/RECORD
CHANGED
```diff
@@ -1,12 +1,9 @@
-ddi_fw/datasets/__init__.py,sha256=
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
+ddi_fw/datasets/core.py,sha256=gmasNdwohZ9Cd1qqhzijoTgX8VHQyzA0aBVtgjLQago,8344
 ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
-ddi_fw/datasets/embedding_generator.py,sha256=jiDKwLaPMaQkloxQkuCrhl-A-2OdvocmkSzjWtUnk4g,2255
-ddi_fw/datasets/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
-ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
-ddi_fw/datasets/ddi_mdl/base.py,sha256=
+ddi_fw/datasets/ddi_mdl/base.py,sha256=bdcGmEbY_2Fe8fg0pKxfMuDopgaPUTUfQasCy8Bhcvc,9313
 ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
 ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
 ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
```
```diff
@@ -72,7 +69,7 @@ ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9P
 ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
 ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
-ddi_fw/langchain/embeddings.py,sha256=
+ddi_fw/langchain/embeddings.py,sha256=eEWy4okcjdhUJHi4N48Wd8XauPXyeaQVLUdNWEvtEcY,6754
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
```
```diff
@@ -87,7 +84,7 @@ ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
 ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
 ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
 ddi_fw/pipeline/multi_pipeline.py,sha256=NfcH4Ze5U-JRiH3lrxEDWj-VPxYQYtp7tq6bLCImBzs,5550
-ddi_fw/pipeline/ner_pipeline.py,sha256=
+ddi_fw/pipeline/ner_pipeline.py,sha256=kNGtkg5rNX5MDywzvRxmvyk-DxXAjEbYzZkp8pNlAZo,6023
 ddi_fw/pipeline/pipeline.py,sha256=70lYsluAnTWDLTlf6rbecffw3Bl34L1_6ALfLUoSvtY,11324
 ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
```
```diff
@@ -97,7 +94,10 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw
-ddi_fw
-ddi_fw
-ddi_fw-0.0.
+ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
+ddi_fw/vectorization/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
+ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
+ddi_fw-0.0.159.dist-info/METADATA,sha256=Ai8ONw45d5f5yEd3_SaNEKBew73TxPR0nrc75J4U0Ck,3145
+ddi_fw-0.0.159.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ddi_fw-0.0.159.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.159.dist-info/RECORD,,
```
ddi_fw/datasets/embedding_generator.py
DELETED
```diff
@@ -1,67 +0,0 @@
-# !pip install -U sentence-transformers
-
-# from transformers import BertTokenizer,BertForPreTraining,BertModel
-# from sentence_transformers import SentenceTransformer, util
-import pandas as pd
-import numpy as np
-import nltk
-from nltk import sent_tokenize
-from tqdm import tqdm
-
-
-nltk.download('punkt')
-
-import os
-def check_file_exists(path):
-    return os.path.isdir(path)
-
-def get_model_name_or_local_path(model_local_path, model_name):
-    if check_file_exists(model_local_path):
-        return model_local_path
-    return model_name
-
-import re
-def process_text(text):
-    text = re.sub("\[L\d*\]", "",text)
-    text = text.replace("[","")
-    text = text.replace("]","")
-    return text
-
-
-
-from collections import defaultdict
-from functools import partial
-
-# NOTE: determining the input size from the model,
-def create_embeddings(model, data, column, drop_column=True):
-    # model._modules['1'].get_sentence_embedding_dimension()
-    # shape = (1,model._modules['0'].get_word_embedding_dimension())
-    shape = model._modules['0'].get_word_embedding_dimension()
-    column_embeddings_dict = defaultdict(lambda: np.zeros(shape))
-    for index, row in tqdm(data.iterrows()):
-        # if index == 10:
-        #     break
-        text = data[column][index]
-        # zero in the else branch
-        if text == None or type(text) != str:
-            embeddings = None
-        else:
-            sentences = sent_tokenize(text)
-            embeddings = model.encode(sentences)
-
-        # TODO: try providing the embedding of the most similar drug
-        if embeddings is None or len(embeddings) == 0: # embedding check none type
-            sum_of_embeddings = np.zeros(shape)
-        else:
-            sum_of_embeddings = np.sum(embeddings, axis = 0)
-        # column_embeddings_dict[row['id']] = sum_of_embeddings.reshape(1, -1) # 2d
-        column_embeddings_dict[row['id']] = sum_of_embeddings
-        # data.iloc[index][column+'_embedding']=sum_of_embeddings
-
-    # data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
-    data[column+'_embedding'] = pd.Series(list(column_embeddings_dict.values()))
-    if(drop_column):
-        data.drop([column], axis = 1, inplace = True)
-    # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
-    return column_embeddings_dict
-
```
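The deleted helper summed per-sentence embeddings into a single vector per dataframe row. If equivalent behavior is needed against 0.0.159, a compact stand-in might look like this — a sketch, not a package API; the function name, the model name, and the naive sentence split are all assumptions:

```python
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

def embed_column(data: pd.DataFrame, column: str,
                 model_name: str = "all-MiniLM-L6-v2") -> pd.DataFrame:
    """Sum sentence embeddings per row, mirroring the removed create_embeddings."""
    model = SentenceTransformer(model_name)          # model name is assumed
    dim = model.get_sentence_embedding_dimension()

    def embed(text):
        if not isinstance(text, str) or not text.strip():
            return np.zeros(dim)                     # zero vector for missing text
        sentences = text.split(". ")                 # crude stand-in for sent_tokenize
        return np.sum(model.encode(sentences), axis=0)

    data[column + "_embedding"] = data[column].apply(embed)
    return data
```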
/ddi_fw/{datasets → vectorization}/feature_vector_generation.py
File without changes
/ddi_fw/{datasets → vectorization}/idf_helper.py
File without changes
{ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/top_level.txt
File without changes