ddi-fw 0.0.157__py3-none-any.whl → 0.0.159__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/__init__.py CHANGED
@@ -2,9 +2,6 @@ from .core import BaseDataset
  from .ddi_mdl.base import DDIMDLDataset
  from .ddi_mdl_text.base import DDIMDLDatasetV2
  from .mdf_sa_ddi.base import MDFSADDIDataset
- from .embedding_generator import create_embeddings
- from .idf_helper import IDF
- from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
  from .dataset_splitter import DatasetSplitter
  __all__ = ['BaseDataset','DDIMDLDataset','MDFSADDIDataset']
 
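The practical effect of this hunk: create_embeddings, IDF, SimilarityMatrixGenerator, and VectorGenerator are no longer importable from ddi_fw.datasets. A minimal migration sketch, assuming only the import paths moved (the new paths come from the ddi_fw/vectorization/__init__.py hunk later in this diff; create_embeddings is deleted outright and has no replacement in this release):

    # Worked in 0.0.157, raises ImportError in 0.0.159:
    # from ddi_fw.datasets import IDF, SimilarityMatrixGenerator, VectorGenerator

    # 0.0.159 location (see the new ddi_fw/vectorization/__init__.py below):
    from ddi_fw.vectorization import IDF, SimilarityMatrixGenerator, VectorGenerator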
ddi_fw/datasets/core.py CHANGED
@@ -4,11 +4,23 @@ import numpy as np
  import pandas as pd
  from pydantic import BaseModel, Field, computed_field
  from ddi_fw.datasets.dataset_splitter import DatasetSplitter
- from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
- from ddi_fw.langchain.embeddings import PoolingStrategy
  from ddi_fw.utils.utils import create_folder_if_not_exists
 
 
+ try:
+     from ddi_fw.vectorization import SimilarityMatrixGenerator, VectorGenerator
+ except ImportError:
+     raise ImportError(
+         "Failed to import vectorization module. Ensure that the module exists and is correctly installed. ")
+
+ try:
+     from ddi_fw.langchain.embeddings import PoolingStrategy
+ except ImportError:
+     raise ImportError(
+         "Failed to import langchain.embeddings module. ")
+
+
+
  def stack(df_column):
      return np.stack(df_column.values)
 
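Both guards fail fast with an explicit message when an optional subpackage is missing. A sketch of a softer variant, assuming a caller prefers to degrade instead of re-raising (the HAS_EMBEDDINGS flag is hypothetical, not part of ddi_fw):

    # Soft-fail variant of the guard above; HAS_EMBEDDINGS is a hypothetical
    # flag for illustration, not part of ddi_fw.
    try:
        from ddi_fw.langchain.embeddings import PoolingStrategy
        HAS_EMBEDDINGS = True
    except ImportError:
        PoolingStrategy = None
        HAS_EMBEDDINGS = False  # embedding-dependent code paths can check this flag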
ddi_fw/datasets/ddi_mdl/base.py CHANGED
@@ -1,22 +1,21 @@
- import glob
  import pathlib
  from typing import List, Optional, Tuple
- from ddi_fw.datasets.core import BaseDataset, TextDatasetMixin, generate_sim_matrices_new, generate_vectors
- from ddi_fw.datasets.dataset_splitter import DatasetSplitter
+ from ddi_fw.datasets.core import TextDatasetMixin, generate_sim_matrices_new, generate_vectors
  from ddi_fw.datasets.db_utils import create_connection
- from ddi_fw.datasets.idf_helper import IDF
- from ddi_fw.utils.utils import create_folder_if_not_exists
  import numpy as np
  import pandas as pd
  from pydantic import BaseModel, Field, model_validator, root_validator
- from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator,VectorGenerator
- from ddi_fw.langchain.embeddings import PoolingStrategy
  from abc import ABC, abstractmethod
  from sklearn.preprocessing import LabelBinarizer
-
- from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
  import logging
 
+
+ try:
+     from ddi_fw.vectorization import IDF
+ except ImportError:
+     raise ImportError(
+         "Failed to import vectorization module. Ensure that the module exists and is correctly installed. ")
+
  logger = logging.getLogger(__name__)
 
  # Constants for embedding, chemical properties, and NER columns
@@ -32,14 +31,15 @@ LIST_OF_NER_COLUMNS = ['tui', 'cui', 'entities']
 
  HERE = pathlib.Path(__file__).resolve().parent
 
+
  class DDIMDLDataset(TextDatasetMixin):
      dataset_name: str = "DDIMDLDataset"
      index_path: str = Field(default_factory=lambda: str(
          pathlib.Path(__file__).resolve().parent.joinpath('indexes')))
      # drugs_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
      # ddis_df: pd.DataFrame = Field(default_factory=pd.DataFrame)
-     drugs_df: Optional[pd.DataFrame] = None
-     ddis_df: Optional[pd.DataFrame] = None
+     drugs_df: Optional[pd.DataFrame] = None
+     ddis_df: Optional[pd.DataFrame] = None
 
      chemical_property_columns: list[str] = Field(
          default_factory=lambda: LIST_OF_CHEMICAL_PROPERTY_COLUMNS)
@@ -50,8 +50,8 @@ class DDIMDLDataset(TextDatasetMixin):
      cui_threshold: float | None = None
      entities_threshold: float | None = None
 
-
      # @model_validator
+
      def validate_columns(self, values):
          if not set(values['chemical_property_columns']).issubset(LIST_OF_CHEMICAL_PROPERTY_COLUMNS):
              raise ValueError("Invalid chemical property columns")
@@ -64,9 +64,10 @@ class DDIMDLDataset(TextDatasetMixin):
          super().__init__(**kwargs)
          self.class_column = 'event_category'
          _db_path = HERE.joinpath('data/event.db')
-
+
          self.__similarity_related_columns__ = []
-         self.__similarity_related_columns__.extend(self.chemical_property_columns)
+         self.__similarity_related_columns__.extend(
+             self.chemical_property_columns)
          self.__similarity_related_columns__.extend(self.ner_columns)
          # TODO with resource
          self._conn = create_connection(_db_path.absolute().as_posix())
@@ -112,9 +113,9 @@ class DDIMDLDataset(TextDatasetMixin):
      def prep(self):
          if self.drugs_df is None or self.ddis_df is None:
              raise Exception("There is no data")
-
+
          drug_ids = self.drugs_df['id'].to_list()
-
+
          filtered_df = self.drugs_df
          combined_df = filtered_df.copy()
 
@@ -140,11 +141,12 @@ class DDIMDLDataset(TextDatasetMixin):
              if key.startswith('entities'):
                  threshold = self.entities_threshold
              combined_df[key] = filtered_ner_df[key]
-             valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
+             valid_codes = idf_scores_df[idf_scores_df[key]
+                                         > threshold].index
 
              # print(f'{key}: valid code size = {len(valid_codes)}')
              combined_df[key] = combined_df[key].apply(lambda items:
-                 [item for item in items if item in valid_codes])
+                                                       [item for item in items if item in valid_codes])
 
          moved_columns = ['id']
          moved_columns.extend(self.__similarity_related_columns__)
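The reflowed filter keeps only NER codes whose IDF score exceeds the per-column threshold. A toy illustration of that step; the idf_scores_df layout and the numbers are assumptions for the example, not taken from the package:

    import pandas as pd

    # Hypothetical IDF scores per NER code (index = code, column = NER key).
    idf_scores_df = pd.DataFrame({"cui": [2.3, 0.1, 1.7]},
                                 index=["C0001", "C0002", "C0003"])
    threshold = 0.5
    valid_codes = idf_scores_df[idf_scores_df["cui"] > threshold].index

    items = ["C0001", "C0002", "C0003"]
    print([item for item in items if item in valid_codes])  # ['C0001', 'C0003']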
@@ -153,28 +155,29 @@ class DDIMDLDataset(TextDatasetMixin):
          chemical_properties_df = chemical_properties_df.fillna("").apply(list)
 
          # generate_vectors will return ndarrays inside a dictionary
-         generated_vectors = generate_vectors(chemical_properties_df, self.__similarity_related_columns__)
+         generated_vectors = generate_vectors(
+             chemical_properties_df, self.__similarity_related_columns__)
 
          similarity_matrices = generate_sim_matrices_new(
-             chemical_properties_df,generated_vectors, self.__similarity_related_columns__, key_column= "id")
+             chemical_properties_df, generated_vectors, self.__similarity_related_columns__, key_column="id")
 
          event_categories = self.ddis_df['event_category']
          labels = event_categories.tolist()
          lb = LabelBinarizer()
          lb.fit(labels)
          classes = lb.transform(labels)
-
+
          def similarity_lambda_fnc(row, value):
              if row['id1'] in value:
                  return value[row['id1']]
 
-         def lambda_fnc(row: pd.Series, value)-> Optional[np.float16]:
+         def lambda_fnc(row: pd.Series, value) -> Optional[np.float16]:
              if row['id1'] in value and row['id2'] in value:
                  return np.float16(np.hstack(
                      (value[row['id1']], value[row['id2']])))
              return None
-             # return np.hstack(
-             #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
+         # return np.hstack(
+         #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
 
          def x_fnc(row, embeddings_after_pooling):
              if row['id1'] in embeddings_after_pooling:
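lambda_fnc builds the pairwise feature for each interaction by concatenating the two drugs' vectors and downcasting to float16. A toy run with made-up two-dimensional vectors:

    import numpy as np
    import pandas as pd

    # Hypothetical per-drug vectors; real ones come from generate_vectors above.
    value = {"D1": np.array([0.10, 0.20]), "D2": np.array([0.30, 0.40])}
    row = pd.Series({"id1": "D1", "id2": "D2"})
    paired = np.float16(np.hstack((value[row["id1"]], value[row["id2"]])))
    print(paired)  # [0.1 0.2 0.3 0.4], dtype float16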
@@ -207,13 +210,12 @@ class DDIMDLDataset(TextDatasetMixin):
 
          dataframe = self.ddis_df.copy()
          if not isinstance(classes, (list, pd.Series, np.ndarray)):
-             raise TypeError("classes must be an iterable (list, Series, or ndarray)")
+             raise TypeError(
+                 "classes must be an iterable (list, Series, or ndarray)")
 
          if len(classes) != len(dataframe):
-             raise ValueError("Length of classes must match the number of rows in the DataFrame")
+             raise ValueError(
+                 "Length of classes must match the number of rows in the DataFrame")
 
          dataframe['class'] = list(classes)
          self.set_dataframe(dataframe)
-
-
-
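The two guards protect the binarized labels attached in the last two lines. A short sketch of that step with toy event categories (not taken from event.db):

    import pandas as pd
    from sklearn.preprocessing import LabelBinarizer

    ddis_df = pd.DataFrame({"event_category": ["increase", "decrease", "no_effect"]})
    lb = LabelBinarizer()
    classes = lb.fit_transform(ddis_df["event_category"])  # one-hot rows, shape (3, 3)

    assert len(classes) == len(ddis_df)  # the ValueError guard above enforces this
    ddis_df["class"] = list(classes)     # one ndarray per row, as in prep()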
ddi_fw/langchain/embeddings.py CHANGED
@@ -1,36 +1,11 @@
- # !pip install -U sentence-transformers
-
- # from transformers import BertTokenizer,BertForPreTraining,BertModel
- # from sentence_transformers import SentenceTransformer, util
- import pandas as pd
  import numpy as np
- from nltk import sent_tokenize
- import torch
- from tqdm import tqdm
-
-
- from collections import defaultdict
- from functools import partial
- from abc import ABC, abstractmethod
  from transformers import AutoModel, AutoTokenizer
- from sentence_transformers import SentenceTransformer, util
-
- from typing import Any, Dict, List, Optional
+ from sentence_transformers import SentenceTransformer
+ from typing import Any, List
  from langchain_core.embeddings import Embeddings
- from pydantic import BaseModel, ConfigDict, Field, SecretStr, computed_field
+ from pydantic import BaseModel, ConfigDict, computed_field
  from langchain.embeddings import SentenceTransformerEmbeddings
-
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import Chroma
- import chromadb
-
-
-
- # def split_docs(documents, chunk_size=1000, chunk_overlap=20):
- #     text_splitter = RecursiveCharacterTextSplitter(
- #         chunk_size=chunk_size, chunk_overlap=chunk_overlap)
- #     docs = text_splitter.split_documents(documents)
- #     return docs
+
 
  class PoolingStrategy():
      def __init__(self):
ddi_fw/pipeline/ner_pipeline.py CHANGED
@@ -1,7 +1,7 @@
  from collections import defaultdict
  import numpy as np
  from ddi_fw.datasets.core import BaseDataset
- from ddi_fw.datasets.idf_helper import IDF
+ from ddi_fw.vectorization.idf_helper import IDF
  from typing import Any, Dict, List, Optional
  from itertools import product
 
ddi_fw/vectorization/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .idf_helper import IDF
+ from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
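With this new module in place, both the package re-export and the direct module path resolve; ner_pipeline.py above uses the direct path:

    from ddi_fw.vectorization import IDF, SimilarityMatrixGenerator, VectorGenerator  # via the re-exports above
    from ddi_fw.vectorization.idf_helper import IDF  # direct module path, as in ner_pipeline.py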
{ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ddi_fw
- Version: 0.0.157
+ Version: 0.0.159
  Summary: Do not use :)
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -8,7 +8,6 @@ Keywords: Machine Learning
  Classifier: Development Status :: 1 - Planning
  Classifier: Environment :: Console
  Classifier: Intended Audience :: Science/Research
- Classifier: License :: OSI Approved :: MIT License
  Classifier: Operating System :: OS Independent
  Classifier: Framework :: Pytest
  Classifier: Framework :: tox
@@ -22,8 +21,10 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
+ Requires-Dist: pydantic==2.10.6
  Requires-Dist: tqdm>=4.66.6
  Requires-Dist: pandas>=2.2.0
+ Requires-Dist: py7zr==0.22.0
  Provides-Extra: llm
  Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1; extra == "llm"
  Requires-Dist: transformers>=4.42.4; extra == "llm"
@@ -32,6 +33,7 @@ Requires-Dist: tokenizers>=0.19.1; extra == "llm"
  Requires-Dist: openai>=1.52.2; extra == "llm"
  Requires-Dist: langchain>=0.3.4; extra == "llm"
  Requires-Dist: langchain_community==0.3.3; extra == "llm"
+ Requires-Dist: chromadb>=0.5.15; extra == "llm"
  Provides-Extra: ml
  Requires-Dist: scikit-learn==1.5.2; extra == "ml"
  Requires-Dist: tensorflow<2.18.0,>=2.17.0; extra == "ml"
@@ -43,17 +45,15 @@ Requires-Dist: scipy==1.13.1; extra == "ml"
  Requires-Dist: pandas>=2.2.0; extra == "ml"
  Requires-Dist: plotly==5.24.1; extra == "ml"
  Requires-Dist: matplotlib==3.8.0; extra == "ml"
- Requires-Dist: rdkit==2023.3.3; extra == "ml"
  Requires-Dist: datasets==3.0.2; extra == "ml"
  Requires-Dist: tqdm>=4.66.6; extra == "ml"
  Provides-Extra: datasets
  Requires-Dist: datasets==3.0.2; extra == "datasets"
  Requires-Dist: unstructured==0.16.3; extra == "datasets"
- Requires-Dist: py7zr==0.22.0; extra == "datasets"
  Requires-Dist: xmlschema==3.4.2; extra == "datasets"
  Provides-Extra: drugbank
  Requires-Dist: rdkit==2023.3.3; extra == "drugbank"
- Requires-Dist: openai>=1.52.2; extra == "drugbank"
+ Requires-Dist: xmlschema==3.4.2; extra == "drugbank"
  Provides-Extra: pipeline-and-ner
  Requires-Dist: nltk>=3.8.1; extra == "pipeline-and-ner"
  Requires-Dist: stanza==1.9.2; extra == "pipeline-and-ner"
@@ -61,16 +61,8 @@ Requires-Dist: transformers>=4.42.4; extra == "pipeline-and-ner"
  Requires-Dist: sentence-transformers<=3.3.1,>=3.0.1; extra == "pipeline-and-ner"
  Requires-Dist: mlflow==2.16.1; extra == "pipeline-and-ner"
  Provides-Extra: utils
- Requires-Dist: pydantic==2.10.6; extra == "utils"
  Requires-Dist: python-stopwatch==1.1.11; extra == "utils"
  Requires-Dist: importlib-resources==6.4.5; extra == "utils"
  Requires-Dist: lxml==5.3.0; extra == "utils"
  Requires-Dist: pyarrow==17.0.0; extra == "utils"
  Requires-Dist: pycryptodomex==3.22.0; extra == "utils"
- Requires-Dist: pydantic-settings-2.8.1; extra == "utils"
- Requires-Dist: python-dotenv-1.1.0; extra == "utils"
- Requires-Dist: python-iso639-2025.2.18; extra == "utils"
- Requires-Dist: python-magic-0.4.27; extra == "utils"
- Requires-Dist: pyzstd==0.16.2; extra == "utils"
- Requires-Dist: databricks-sdk-0.47.0; extra == "utils"
- Requires-Dist: python-tml-1.0.2; extra == "utils"
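Net effect of the metadata hunks: pydantic and py7zr move from extras into the core requirements, chromadb becomes part of the llm extra, and the drugbank extra swaps openai for xmlschema. One way to check an installed copy against these hunks, using only the standard library:

    from importlib.metadata import requires, version

    print(version("ddi_fw"))  # expect 0.0.159
    for req in requires("ddi_fw") or []:
        if "extra ==" not in req:
            print(req)  # core deps now include pydantic==2.10.6 and py7zr==0.22.0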
{ddi_fw-0.0.157.dist-info → ddi_fw-0.0.159.dist-info}/RECORD RENAMED
@@ -1,12 +1,9 @@
- ddi_fw/datasets/__init__.py,sha256=yDsRQD_9Ijpm_Rl2wSDwdutG5Q_wca_UBPEvm7nBx04,444
- ddi_fw/datasets/core.py,sha256=vRMpUsIHIbOKi-6TuUPNu1Ve3ny3cS9hdsydQxFCNvE,8078
+ ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
+ ddi_fw/datasets/core.py,sha256=gmasNdwohZ9Cd1qqhzijoTgX8VHQyzA0aBVtgjLQago,8344
  ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
- ddi_fw/datasets/embedding_generator.py,sha256=jiDKwLaPMaQkloxQkuCrhl-A-2OdvocmkSzjWtUnk4g,2255
- ddi_fw/datasets/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
- ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
  ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
- ddi_fw/datasets/ddi_mdl/base.py,sha256=yLxNzDYjLekq1qE6mKh6WkzUU5Xvn3JBEET9Ed-7b_E,9471
+ ddi_fw/datasets/ddi_mdl/base.py,sha256=bdcGmEbY_2Fe8fg0pKxfMuDopgaPUTUfQasCy8Bhcvc,9313
  ddi_fw/datasets/ddi_mdl/debug.log,sha256=eWz05j8RFqZuHFDTCF7Rck5w4rvtTanFN21iZsgxO7Y,115
  ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
  ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
@@ -72,7 +69,7 @@ ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9P
  ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
  ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
  ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
- ddi_fw/langchain/embeddings.py,sha256=XzIYgmqnAO93pnavKRDhYDoz0RhDn-RoC7CDc0yAvbM,7572
+ ddi_fw/langchain/embeddings.py,sha256=eEWy4okcjdhUJHi4N48Wd8XauPXyeaQVLUdNWEvtEcY,6754
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
  ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
  ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
@@ -87,7 +84,7 @@ ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
  ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
  ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
  ddi_fw/pipeline/multi_pipeline.py,sha256=NfcH4Ze5U-JRiH3lrxEDWj-VPxYQYtp7tq6bLCImBzs,5550
- ddi_fw/pipeline/ner_pipeline.py,sha256=q1aKjb54Ra1HzZ7dARvBw6lB37je9R-POEf2h6QT_nU,6018
+ ddi_fw/pipeline/ner_pipeline.py,sha256=kNGtkg5rNX5MDywzvRxmvyk-DxXAjEbYzZkp8pNlAZo,6023
  ddi_fw/pipeline/pipeline.py,sha256=70lYsluAnTWDLTlf6rbecffw3Bl34L1_6ALfLUoSvtY,11324
  ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
  ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
@@ -97,7 +94,10 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
- ddi_fw-0.0.157.dist-info/METADATA,sha256=BDNkvrnqN1gxAqUjpmyUqE-YGz86JZpVF0NM_q-oEJk,3612
- ddi_fw-0.0.157.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- ddi_fw-0.0.157.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
- ddi_fw-0.0.157.dist-info/RECORD,,
+ ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
+ ddi_fw/vectorization/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
+ ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
+ ddi_fw-0.0.159.dist-info/METADATA,sha256=Ai8ONw45d5f5yEd3_SaNEKBew73TxPR0nrc75J4U0Ck,3145
+ ddi_fw-0.0.159.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ ddi_fw-0.0.159.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ ddi_fw-0.0.159.dist-info/RECORD,,
ddi_fw/datasets/embedding_generator.py DELETED
@@ -1,67 +0,0 @@
- # !pip install -U sentence-transformers
-
- # from transformers import BertTokenizer,BertForPreTraining,BertModel
- # from sentence_transformers import SentenceTransformer, util
- import pandas as pd
- import numpy as np
- import nltk
- from nltk import sent_tokenize
- from tqdm import tqdm
-
-
- nltk.download('punkt')
-
- import os
- def check_file_exists(path):
-     return os.path.isdir(path)
-
- def get_model_name_or_local_path(model_local_path, model_name):
-     if check_file_exists(model_local_path):
-         return model_local_path
-     return model_name
-
- import re
- def process_text(text):
-     text = re.sub("\[L\d*\]", "",text)
-     text = text.replace("[","")
-     text = text.replace("]","")
-     return text
-
-
-
- from collections import defaultdict
- from functools import partial
-
- # NOTE: infer the input size from the model
- def create_embeddings(model, data, column, drop_column=True):
-     # model._modules['1'].get_sentence_embedding_dimension()
-     # shape = (1,model._modules['0'].get_word_embedding_dimension())
-     shape = model._modules['0'].get_word_embedding_dimension()
-     column_embeddings_dict = defaultdict(lambda: np.zeros(shape))
-     for index, row in tqdm(data.iterrows()):
-         # if index == 10:
-         #     break
-         text = data[column][index]
-         # zero in the else branch
-         if text == None or type(text) != str:
-             embeddings = None
-         else:
-             sentences = sent_tokenize(text)
-             embeddings = model.encode(sentences)
-
-         # TODO: try using the embedding value of a similar drug
-         if embeddings is None or len(embeddings) == 0:  # embedding check none type
-             sum_of_embeddings = np.zeros(shape)
-         else:
-             sum_of_embeddings = np.sum(embeddings, axis = 0)
-         # column_embeddings_dict[row['id']] = sum_of_embeddings.reshape(1, -1) # 2d
-         column_embeddings_dict[row['id']] = sum_of_embeddings
-         # data.iloc[index][column+'_embedding']=sum_of_embeddings
-
-     # data[column+'_embedding'] = pd.Series(column_embeddings_dict.values())
-     data[column+'_embedding'] = pd.Series(list(column_embeddings_dict.values()))
-     if(drop_column):
-         data.drop([column], axis = 1, inplace = True)
-     # data[column+'_embedding'] = [column_embeddings_dict[row['name']] for index, row in data.iterrows()]
-     return column_embeddings_dict
-