radnn-0.0.8.tar.gz → radnn-0.0.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {radnn-0.0.8 → radnn-0.0.9}/PKG-INFO +4 -25
  2. {radnn-0.0.8 → radnn-0.0.9}/pyproject.toml +3 -4
  3. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/__init__.py +3 -2
  4. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/dataset_base.py +17 -5
  5. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/jsonfile.py +3 -0
  6. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/textfile.py +17 -20
  7. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/PKG-INFO +4 -25
  8. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/SOURCES.txt +3 -0
  9. radnn-0.0.9/test/test_corpus.py +91 -0
  10. radnn-0.0.9/test/test_corpus_load.py +209 -0
  11. radnn-0.0.9/test/test_text_pipeline.py +17 -0
  12. {radnn-0.0.8 → radnn-0.0.9}/LICENSE.txt +0 -0
  13. {radnn-0.0.8 → radnn-0.0.9}/README.md +0 -0
  14. {radnn-0.0.8 → radnn-0.0.9}/setup.cfg +0 -0
  15. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/core.py +0 -0
  16. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/__init__.py +0 -0
  17. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/data_feed.py +0 -0
  18. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/dataset_folder.py +0 -0
  19. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/image_dataset.py +0 -0
  20. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/image_dataset_files.py +0 -0
  21. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/preprocess/__init__.py +0 -0
  22. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/preprocess/normalizer.py +0 -0
  23. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/preprocess/standardizer.py +0 -0
  24. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/sample_set.py +0 -0
  25. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/sequence_dataset.py +0 -0
  26. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/subset_type.py +0 -0
  27. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/tf_classification_data_feed.py +0 -0
  28. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/errors.py +0 -0
  29. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/evaluation/__init__.py +0 -0
  30. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/evaluation/evaluate_classification.py +0 -0
  31. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/__init__.py +0 -0
  32. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/ml_experiment.py +0 -0
  33. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/ml_experiment_config.py +0 -0
  34. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/ml_experiment_env.py +0 -0
  35. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/ml_experiment_store.py +0 -0
  36. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/images/__init__.py +0 -0
  37. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/images/colors.py +0 -0
  38. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/images/image_processor.py +0 -0
  39. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/__init__.py +0 -0
  40. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/keras_learning_rate_scheduler.py +0 -0
  41. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/keras_optimization_algorithm.py +0 -0
  42. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/learning_algorithm.py +0 -0
  43. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/state/__init__.py +0 -0
  44. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/state/keras_best_state_saver.py +0 -0
  45. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/ml_system.py +0 -0
  46. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/__init__.py +0 -0
  47. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_auto_multi_image.py +0 -0
  48. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_confusion_matrix.py +0 -0
  49. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_learning_curve.py +0 -0
  50. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_multi_scatter.py +0 -0
  51. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_roc.py +0 -0
  52. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_voronoi_2d.py +0 -0
  53. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/stats/__init__.py +0 -0
  54. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/stats/descriptive_stats.py +0 -0
  55. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/__init__.py +0 -0
  56. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/__init__.py +0 -0
  57. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/csvfile.py +0 -0
  58. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/filelist.py +0 -0
  59. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/fileobject.py +0 -0
  60. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/imgfile.py +0 -0
  61. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/picklefile.py +0 -0
  62. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/filestore.py +0 -0
  63. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/filesystem.py +0 -0
  64. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/hosts/__init__.py +0 -0
  65. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/hosts/colab_host.py +0 -0
  66. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/hosts/linux_host.py +0 -0
  67. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/hosts/windows_host.py +0 -0
  68. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/tee_logger.py +0 -0
  69. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/__init__.py +0 -0
  70. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/semaphore_lock.py +0 -0
  71. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/thread_context.py +0 -0
  72. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/thread_safe_queue.py +0 -0
  73. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/thread_safe_string_collection.py +0 -0
  74. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/thread_worker.py +0 -0
  75. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/utils.py +0 -0
  76. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/dependency_links.txt +0 -0
  77. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/requires.txt +0 -0
  78. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/top_level.txt +0 -0
  79. {radnn-0.0.8 → radnn-0.0.9}/test/test_config.py +0 -0
  80. {radnn-0.0.8 → radnn-0.0.9}/test/test_data_feed.py +0 -0
  81. {radnn-0.0.8 → radnn-0.0.9}/test/test_dataset_base.py +0 -0
  82. {radnn-0.0.8 → radnn-0.0.9}/test/test_dataset_from_pandas.py +0 -0
  83. {radnn-0.0.8 → radnn-0.0.9}/test/test_experiment_env.py +0 -0
  84. {radnn-0.0.8 → radnn-0.0.9}/test/test_image_dataset_from_files.py +0 -0
  85. {radnn-0.0.8 → radnn-0.0.9}/test/test_ml_experiment_config.py +0 -0
  86. {radnn-0.0.8 → radnn-0.0.9}/test/test_ml_system.py +0 -0
  87. {radnn-0.0.8 → radnn-0.0.9}/test/test_mlsystem.py +0 -0
  88. {radnn-0.0.8 → radnn-0.0.9}/test/test_mnist.py +0 -0
  89. {radnn-0.0.8 → radnn-0.0.9}/test/test_normalizer.py +0 -0
  90. {radnn-0.0.8 → radnn-0.0.9}/test/test_normalizer_div_zero.py +0 -0
  91. {radnn-0.0.8 → radnn-0.0.9}/test/test_sample_set.py +0 -0
  92. {radnn-0.0.8 → radnn-0.0.9}/test/test_standardizer.py +0 -0
  93. {radnn-0.0.8 → radnn-0.0.9}/test/test_train.py +0 -0
{radnn-0.0.8 → radnn-0.0.9}/PKG-INFO
@@ -1,35 +1,13 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: radnn
-Version: 0.0.8
+Version: 0.0.9
 Summary: Rapid Deep Neural Networks
 Author-email: "Pantelis I. Kaplanoglou" <pikaplanoglou@ihu.gr>
-License: MIT License
-
-Copyright (c) 2017-2025 Pantelis I. Kaplanoglou
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
+License-Expression: MIT
 Project-URL: Homepage, https://github.com/pikaplan/radnn
 Project-URL: Documentation, https://radnn.readthedocs.io/
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python
 Classifier: Topic :: Software Development
 Classifier: Topic :: Scientific/Engineering
@@ -47,6 +25,7 @@ Requires-Dist: numpy>=1.26.4
 Requires-Dist: matplotlib>=3.8.4
 Requires-Dist: pandas>=2.2.1
 Requires-Dist: scikit-learn>=1.4.2
+Dynamic: license-file
 
 # radnn - Rapid Deep Neural Networks
 
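The metadata hunk above replaces the embedded MIT license text with the SPDX expression form introduced by core metadata 2.4 (PEP 639). As a quick cross-check, the new fields can be read off an installed copy with the standard library alone; a minimal sketch, assuming radnn 0.0.9 is installed:

    # Read the core metadata of the installed distribution.
    from importlib.metadata import metadata

    meta = metadata("radnn")
    print(meta["Metadata-Version"])    # 2.4, per the hunk above
    print(meta["License-Expression"])  # MIT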
{radnn-0.0.8 → radnn-0.0.9}/pyproject.toml
@@ -1,18 +1,17 @@
 [project]
 name = "radnn"
-version = "0.0.8"
+version = "0.0.9"
 description = "Rapid Deep Neural Networks"
 readme = "README.md"
 authors = [
   {name = "Pantelis I. Kaplanoglou", email = "pikaplanoglou@ihu.gr"}
 ]
-
 requires-python = ">=3.7"
-license = {file = "LICENSE.txt"}
+license = "MIT"
+license-files = ["LICENSE.txt"]
 classifiers=[
   "Intended Audience :: Science/Research",
   "Intended Audience :: Developers",
-  "License :: OSI Approved :: MIT License",
   "Programming Language :: Python",
   "Topic :: Software Development",
   "Topic :: Scientific/Engineering",
{radnn-0.0.8 → radnn-0.0.9}/src/radnn/__init__.py
@@ -3,8 +3,9 @@
 # Version 0.0.6 [2025-02-04]
 # Version 0.0.7.2 [2025-02-17]
 # Version 0.0.7.3 [2025-02-21]
-# Version 0.0.8 [2025-02-xx]
-__version__ = "0.0.8"
+# Version 0.0.8 [2025-02-25]
+# Version 0.0.9 [2025-03-xx]
+__version__ = "0.0.9"
 
 from .system import FileStore, FileSystem
 from .ml_system import MLSystem
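Since __init__.py is the single source of the version string, the bump is also visible at runtime; a trivial check after upgrading:

    import radnn
    print(radnn.__version__)  # "0.0.9" after this release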
{radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/dataset_base.py
@@ -236,7 +236,7 @@ class DataSetBase(object):
   def has_cache(self, samples_file_prefix="Samples"):
     return self.filestore.exists("%s.pkl" % samples_file_prefix) or self.filestore.exists("%s.TS.pkl" % samples_file_prefix)
   # --------------------------------------------------------------------------------------------------------------------
-  def load_cache(self, filestore: FileStore = None, samples_file_prefix="Samples", targets_file_prefix="Labels", is_verbose=False):
+  def load_cache(self, filestore: FileStore = None, samples_file_prefix="Samples", targets_file_prefix="Labels", ids_file_prefix="Ids", is_verbose=False):
     if filestore is None:
       filestore = self.filestore
     if filestore is None:
@@ -258,30 +258,39 @@
 
     self.samples = filestore.obj.load("%s.pkl" % samples_file_prefix)
     self.labels = filestore.obj.load("%s.pkl" % targets_file_prefix)
-
+
     if is_verbose:
       print("Loading training set ...")
     nTSSamples = filestore.obj.load("%s.TS.pkl" % samples_file_prefix)
     nTSTargets = filestore.obj.load("%s.TS.pkl" % targets_file_prefix)
     self.assign_training_set(nTSSamples, nTSTargets)
-
+    nTSIDs = filestore.obj.load("%s.TS.pkl" % ids_file_prefix)
+    if nTSIDs is not None:
+      self.ts_sample_ids = nTSIDs
+
     if is_verbose:
       print("Loading validation set ...")
     nVSSamples = filestore.obj.load("%s.VS.pkl" % samples_file_prefix)
     nVSTargets = filestore.obj.load("%s.VS.pkl" % targets_file_prefix)
     self.assign_validation_set(nVSSamples, nVSTargets)
-
+    nVSIds = filestore.obj.load("%s.VS.pkl" % ids_file_prefix)
+    if nVSIds is not None:
+      self.vs_sample_ids = nVSIds
+
     if is_verbose:
       print("Loading unknown test data set ...")
     nUTSamples = filestore.obj.load("%s.UT.pkl" % samples_file_prefix)
     if nUTSamples is not None:
       nUTTargets = filestore.obj.load("%s.UT.pkl" % targets_file_prefix)
       self.assign_unknown_test_set(nUTSamples, nUTTargets)
+      nUTIds = filestore.obj.load("%s.UT.pkl" % ids_file_prefix)
+      if nUTIds is not None:
+        self.ut_sample_ids = nUTIds
 
 
     return bResult
   # --------------------------------------------------------------------------------------------------------------------
-  def save_cache(self, filestore: FileStore = None, samples_file_prefix="Samples", targets_file_prefix="Labels"):
+  def save_cache(self, filestore: FileStore = None, samples_file_prefix="Samples", targets_file_prefix="Labels", ids_file_prefix="Ids"):
     if filestore is None:
       filestore = self.filestore
     if filestore is None:
@@ -293,13 +302,16 @@
 
     filestore.obj.save(self.ts_samples, "%s.TS.pkl" % samples_file_prefix, is_overwriting=True)
     filestore.obj.save(self.ts_labels, "%s.TS.pkl" % targets_file_prefix, is_overwriting=True)
+    filestore.obj.save(self.ts_sample_ids, "%s.TS.pkl" % ids_file_prefix, is_overwriting=True)
 
     filestore.obj.save(self.vs_samples, "%s.VS.pkl" % samples_file_prefix, is_overwriting=True)
     filestore.obj.save(self.vs_labels, "%s.VS.pkl" % targets_file_prefix, is_overwriting=True)
+    filestore.obj.save(self.vs_sample_ids, "%s.VS.pkl" % ids_file_prefix, is_overwriting=True)
 
     if self.ut_samples is not None:
       filestore.obj.save(self.ut_samples, "%s.UT.pkl" % samples_file_prefix, is_overwriting=True)
       filestore.obj.save(self.ut_labels, "%s.UT.pkl" % targets_file_prefix, is_overwriting=True)
+      filestore.obj.save(self.ut_sample_ids, "%s.UT.pkl" % ids_file_prefix, is_overwriting=True)
 
     self.card["name"] = self.name
     if self.feature_count is not None:
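The effect of the new ids_file_prefix parameter is that a cache round-trip now persists per-split sample identifiers (ts_sample_ids, vs_sample_ids, ut_sample_ids) alongside the samples and labels. A sketch of the intended usage, modeled on the bundled test_text_pipeline.py; the keyword arguments simply restate the defaults from the signatures above, and StratifiedCytaChatbotDataset comes from an external datasets module used by the tests, not from radnn itself:

    from radnn import mlsys, FileSystem
    from datasets import StratifiedCytaChatbotDataset  # external helper used by the tests

    mlsys.filesys = FileSystem()
    oDataset = StratifiedCytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))

    # Writes Samples.TS.pkl / Labels.TS.pkl and, new in 0.0.9, Ids.TS.pkl
    # (plus the .VS and .UT counterparts where those splits exist).
    oDataset.save_cache(samples_file_prefix="Samples",
                        targets_file_prefix="Labels",
                        ids_file_prefix="Ids")

    # On reload, the *_sample_ids attributes are restored whenever the
    # corresponding Ids pickles are present; load_cache returns a boolean.
    if oDataset.load_cache(ids_file_prefix="Ids", is_verbose=True):
      print(oDataset.ts_sample_ids[:5])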
{radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/jsonfile.py
@@ -32,6 +32,9 @@ import json
 import glob
 from .fileobject import FileObject
 
+#TODO: jsonpickle
+#https://stackoverflow.com/questions/3768895/how-to-make-a-class-json-serializable
+
 class JSONFile(FileObject):
   # ----------------------------------------------------------------------------------
   def __init__(self, filename, parent_folder=None, error_template=None):
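The new TODO points at the standard options for serializing arbitrary Python objects to JSON: a default hook for the built-in json module, or jsonpickle for type-preserving round-trips. A minimal, self-contained sketch of the idea behind the linked answer; the Card class is a made-up stand-in, not radnn code:

    import json

    class Card:
      # Hypothetical example object with plain attributes.
      def __init__(self, name, feature_count):
        self.name = name
        self.feature_count = feature_count

    # json falls back to the hook for non-serializable types; using the
    # object's __dict__ is the simplest variant, and jsonpickle generalizes
    # it by also recording types so objects can be reconstructed on load.
    print(json.dumps(Card("mnist", 784), default=lambda o: o.__dict__))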
{radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/textfile.py
@@ -43,24 +43,24 @@ class TextFile(FileObject):
   def load(self, filename=None, encoding=None):
     filename = self._useFileName(filename)
 
-    oEncodingToTry = ["utf-8", "utf-16", "latin1", "ascii"]  # Add more if needed
-
     sText = None
-    if encoding is None:
-      bIsLoaded = False
-      for sEnc in oEncodingToTry:
-        try:
-          with open(filename, "r", encoding=sEnc) as oFile:
-            sText = oFile.read()
-          bIsLoaded = True
-          break
-        except (UnicodeDecodeError, UnicodeError):
-          continue
-      if not bIsLoaded:
-        raise ValueError("Unsupported encoding")
-    else:
-      with open(filename, "r", encoding=encoding) as oFile:
-        sText = oFile.read()
+    if os.path.isfile(filename):
+      oEncodingToTry = ["utf-8", "utf-16", "latin1", "ascii"]  # Add more if needed
+      if encoding is None:
+        bIsLoaded = False
+        for sEnc in oEncodingToTry:
+          try:
+            with open(filename, "r", encoding=sEnc) as oFile:
+              sText = oFile.read()
+            bIsLoaded = True
+            break
+          except (UnicodeDecodeError, UnicodeError):
+            continue
+        if not bIsLoaded:
+          raise ValueError("Unsupported encoding")
+      else:
+        with open(filename, "r", encoding=encoding) as oFile:
+          sText = oFile.read()
 
     return sText
   # --------------------------------------------------------------------------------------------------------------------
@@ -74,9 +74,6 @@ class TextFile(FileObject):
       p_sFileName : Full path to the text file
       p_sText : Text to write
     """
-    if (self.parent_folder is not None):
-      sFilename = os.path.join(self.parent_folder, sFilename)
-
     if self.is_verbose:
       print(" {.} Saving text to %s" % sFilename)
 
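Taken together, the two textfile.py hunks make load return None for a missing file instead of failing inside open, while keeping the encoding-fallback loop for files that exist; save no longer joins parent_folder itself, presumably because path resolution is centralized (load already goes through _useFileName). A usage sketch; the constructor arguments are an assumption, mirrored from JSONFile(filename, parent_folder=None, ...) in the same package:

    from radnn.system.files.textfile import TextFile

    oFile = TextFile("notes.txt", parent_folder="/tmp")  # assumed signature

    sText = oFile.load()                   # tries utf-8, utf-16, latin1, ascii in turn
    if sText is None:
      print("file does not exist")         # new in 0.0.9: a missing file yields None

    sText = oFile.load(encoding="utf-8")   # explicit encoding bypasses the fallback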
{radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/PKG-INFO
@@ -1,35 +1,13 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: radnn
-Version: 0.0.8
+Version: 0.0.9
 Summary: Rapid Deep Neural Networks
 Author-email: "Pantelis I. Kaplanoglou" <pikaplanoglou@ihu.gr>
-License: MIT License
-
-Copyright (c) 2017-2025 Pantelis I. Kaplanoglou
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
+License-Expression: MIT
 Project-URL: Homepage, https://github.com/pikaplan/radnn
 Project-URL: Documentation, https://radnn.readthedocs.io/
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python
 Classifier: Topic :: Software Development
 Classifier: Topic :: Scientific/Engineering
@@ -47,6 +25,7 @@ Requires-Dist: numpy>=1.26.4
 Requires-Dist: matplotlib>=3.8.4
 Requires-Dist: pandas>=2.2.1
 Requires-Dist: scikit-learn>=1.4.2
+Dynamic: license-file
 
 # radnn - Rapid Deep Neural Networks
 
{radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/SOURCES.txt
@@ -73,6 +73,8 @@ src/radnn/system/threads/thread_safe_queue.py
 src/radnn/system/threads/thread_safe_string_collection.py
 src/radnn/system/threads/thread_worker.py
 test/test_config.py
+test/test_corpus.py
+test/test_corpus_load.py
 test/test_data_feed.py
 test/test_dataset_base.py
 test/test_dataset_from_pandas.py
@@ -86,4 +88,5 @@ test/test_normalizer.py
 test/test_normalizer_div_zero.py
 test/test_sample_set.py
 test/test_standardizer.py
+test/test_text_pipeline.py
 test/test_train.py
radnn-0.0.9/test/test_corpus.py (new file)
@@ -0,0 +1,91 @@
+from radnn import mlsys, FileSystem
+from openpyxl import load_workbook
+from datasets import TextQuestionSample, CytaChatbotDataset
+from chatgpt import ChatGPTAPI
+import re
+from tqdm import tqdm
+
+def split_number(text):
+  match = re.match(r'^(\d+[.)])\s*(.*)', text)
+  if match:
+    number = match.group(1)  # The numeric part with ) or .
+    rest = match.group(2)    # The remaining text
+    return number, rest
+  else:
+    return None, text
+
+mlsys.filesys = FileSystem()
+oDataset = CytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))
+
+
+API_KEY = "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"
+oAPI = ChatGPTAPI(API_KEY)
+
+for sLang in ["EL", "EN"]:
+  if not oDataset.load_question_answers(sLang):
+    sImportFileName = oDataset.fs.subfs("source").file(f"CytaChatbot_v2-{sLang}.xlsx")
+    # Load workbook and select a sheet
+    wb = load_workbook(sImportFileName)
+    sheet = wb.active  # Or wb['SheetName']
+
+    # Iterate over rows
+
+    bIsQuestion = True
+    nIndex = -1
+    for row in sheet.iter_rows(values_only=True):
+      assert row[1] is None, "More columns"
+      bIsQuestion = row[0] is not None
+      bIsAnswer = row[0] is not None
+      if bIsQuestion:
+        nID, sQuestion = split_number(row[0].strip())
+        bIsQuestion = nID is not None
+        if bIsQuestion:
+          nIndex += 1
+          oSample = TextQuestionSample()
+          oDataset.append(oSample)
+
+          sRow = row[0].strip()
+          nID = None
+          if (sRow[1] == ".") or (sRow[1] == ")"):
+            nID = int(sRow[:1])
+            sQuestion = sRow[2:]
+          if (sRow[2] == ".") or (sRow[2] == ")"):
+            nID = int(sRow[:2])
+            sQuestion = sRow[3:]
+          if (sRow[3] == ".") or (sRow[3] == ")"):
+            nID = int(sRow[:3])
+            sQuestion = sRow[4:]
+          oSample.id = nID
+
+          oSample.question = sQuestion.strip()
+          oSample.answer = ""
+          bIsQuestion = False
+      elif bIsAnswer:
+        sRow = row[0].strip()
+        if oSample.answer == "":
+          oSample.answer += "\n" + sRow
+        else:
+          oSample.answer = sRow
+
+      print(oSample)
+
+
+    oDataset.save_question_answers(sLang)
+
+  EXTRA_QUESTIONS_COUNT = 30
+  for nIndex, oSample in tqdm(enumerate(oDataset)):
+    try:
+      if sLang == "EL":
+        prompt = f"Γράψε {EXTRA_QUESTIONS_COUNT} παραλλαγές της ερώτησης: '{oSample.question}', στην απάντηση βάλε μόνο τις παραλλαγές"
+      else:
+        prompt = f"Write {EXTRA_QUESTIONS_COUNT} variations of the question: '{oSample.question}', keep only the variations in the answer"
+      response = oAPI.generate(prompt)
+      oSample.question_alt = response.split('\n')
+
+      print(oSample)
+      print(oSample.question_alt)
+    except Exception as e:
+      new_questions = f"Error: {e}"
+
+  oDataset.save_question_answers_aug(sLang)
+
radnn-0.0.9/test/test_corpus_load.py (new file)
@@ -0,0 +1,209 @@
+import random
+
+from radnn import mlsys, FileSystem
+from openpyxl import load_workbook
+from datasets import TextQuestionSample, CytaChatbotDataset, StratifiedCytaChatbotDataset
+from chatgpt import ChatGPTAPI
+import re
+from tqdm import tqdm
+import ast
+import numpy as np
+
+LANG = "EL"
+
+def split_alt_id_question(question):
+  result = []
+  match = re.match(r"(\d+)\.\s*(.+)", question)
+  if match:
+    result.append(match.group(1))
+    result.append(match.group(2))
+  return result
+
+
+API_KEY = "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"
+oAPI = ChatGPTAPI(API_KEY)
+
+
+
+mlsys.filesys = FileSystem()
+oDataset = CytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))
+if not oDataset.load_question_answers_aug(LANG):
+  print("[>] Attaching alternative questions ...")
+  oDataset.load_question_answers(LANG)
+
+
+  # Calculate the difference between consecutive elements
+  oIds = np.asarray([oSample.id for oSample in oDataset], dtype=np.int32)
+  diffs = np.diff(oIds)
+
+  # Find indices where the difference is more than 1 (indicates a gap)
+  gap_indices = np.where(diffs > 1)[0]
+  if len(gap_indices) > 0:
+    print("[x] Gaps in sample numbering:")
+    # Report the missing numbers
+    for index in gap_indices:
+      start = oIds[index] + 1
+      end = oIds[index + 1] - 1
+      missing = list(range(start, end + 1))
+      print(f"|__ Missing: {missing}")
+
+  if oDataset.fs.subfs("source").exists(f"CytaChatbot_v1_Targets-{LANG}.xlsx"):
+    print("|__ Attaching annotations")
+    sImportFileName = oDataset.fs.subfs("source").file(f"CytaChatbot_v1_Targets-{LANG}.xlsx")
+    oWorkbook = load_workbook(sImportFileName)
+    oSheet = oWorkbook.active  # Or wb['SheetName']
+    oTargets = []
+    for nIndex, oRow in enumerate(oSheet.iter_rows(values_only=True)):
+      if (nIndex > 0) and (oRow[0] is not None):
+        try:
+          nOnehot = np.asarray(oRow[2:5], dtype=np.int32)
+        except Exception as e:
+          print(nIndex, oRow)
+          raise
+        assert nOnehot.sum() == 1, "More than one tags"
+
+        nID = nIndex - 1
+        sID = oRow[0]
+        sSample = oRow[1].strip().replace("?", ";")
+        sSampleV2 = oDataset[nID].question.strip().replace("?", ";")
+        nTarget = np.argmax(nOnehot)
+        oTargets.append(nTarget)
+        oDataset[nID].annotations = nTarget
+        #if sSample != sSampleV2:
+        #  print(nSampleID, sSample, "!=", sSampleV2, nTarget)
+        #else:
+        #  print(nSampleID, sSample, nTarget)
+  else:
+    # TODO: EN Annotations
+    random.seed(2025)
+    oTargets = []
+    for nIndex, oTextRecord in enumerate(oDataset):
+      nTarget = random.randint(0, 2)
+      oTargets.append(nTarget)
+      oTextRecord.annotations = nTarget
+
+
+  print("|__ Adding augmentations")
+  if False:
+    EXTRA_QUESTIONS_COUNT = 30
+    for nIndex, oTextRecord in enumerate(tqdm(oDataset)):
+      try:
+        if LANG == "EL":
+          prompt = f"Γράψε {EXTRA_QUESTIONS_COUNT} παραλλαγές της ερώτησης: '{oTextRecord.question}', στην απάντηση βάλε μόνο τις παραλλαγές"
+        else:
+          prompt = f"Write {EXTRA_QUESTIONS_COUNT} variations of the question: '{oTextRecord.question}', keep only the variations in the answer"
+        response = oAPI.generate(prompt)
+        oTextRecord.question_alt = response.split('\n')
+
+        print(oTextRecord)
+        print(oTextRecord.question_alt)
+      except Exception as e:
+        new_questions = f"Error: {e}"
+
+    oDataset.save_question_answers_aug(LANG)
+  else:
+    mlsys.filesys = FileSystem()
+    oFS = mlsys.filesys.datasets.subfs("CYTACHATBOT")
+    sFileContents = oFS.text.load(f"Augmented_{LANG}.txt")
+    oFilteredLines = []
+    for sLine in sFileContents.splitlines():
+      if sLine.startswith("['") or sLine.startswith('["'):
+        oFilteredLines.append(sLine)
+
+    nSampleIndex = 0
+    for sLine in oFilteredLines:
+      oList = ast.literal_eval(sLine)
+      oListClean = []
+      for x in oList:
+        oParts = split_alt_id_question(x)
+        if len(oParts) > 1:
+          oListClean.append(oParts[1].strip().replace("?", ";"))
+      #print(sID, oListClean)
+      assert len(oListClean) == 30, "Wrong count of alternative questions"
+      oDataset[nSampleIndex].question_alt = oListClean
+      oDataset[nSampleIndex].question = oDataset[nSampleIndex].question.strip().replace("?", ";")
+      nSampleIndex += 1
+
+    oNewDataSet = CytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))
+    for oSample in oDataset:
+      oNewDataSet.append(oSample)
+    oNewDataSet.save_question_answers_aug(LANG)
+else:
+  oNewDataSet = oDataset
+  oTargets = [oSample.annotations for oSample in oDataset]
+
+nClassHistogram, bin_edges = np.histogram(oTargets, bins=3)
+
+
+
+
+
+def StratifiedBalancing(dataset_augmented, class_histogram=[89, 157, 358]):
+  nClassHistogram = class_histogram
+  nClasses = len(nClassHistogram)
+  nExtraSamples = 30
+  nMinorityClassCount = np.min(nClassHistogram)
+  nMaxSamples = nMinorityClassCount*(nExtraSamples + 1)
+  nTarget = (nMaxSamples // 50) * 50
+
+  nMaxSamplesPerClass = np.zeros(nClasses, np.int32)
+  nTargetClassHistogram = np.zeros(nClasses, np.int32)
+  nExtraSamplesPerClass = np.zeros(nClasses, np.int32)
+  nMinusOneSamplesCount = np.zeros(nClasses, np.int32)
+
+  for nIndex, nOriginalCount in enumerate(nClassHistogram):
+    if nOriginalCount != nMinorityClassCount:
+      nExtraSamplesCeil = int(np.ceil(nTarget / nOriginalCount)) - 1
+      nMaxSamplesPerClass[nIndex] = nOriginalCount * (nExtraSamplesCeil + 1)
+      nExtraSamplesPerClass[nIndex] = nExtraSamplesCeil
+      nTargetClassHistogram[nIndex] = nTarget
+      nMinusOneSamplesCount[nIndex] = nTarget - nMaxSamplesPerClass[nIndex]
+    else:
+      nMaxSamplesPerClass[nIndex] = nMaxSamples
+      nExtraSamplesPerClass[nIndex] = nExtraSamples
+      nTargetClassHistogram[nIndex] = nTarget
+      nMinusOneSamplesCount[nIndex] = nTarget - nMaxSamples
+
+
+
+  print(nClassHistogram)
+  print(nExtraSamplesPerClass)
+  print(nMaxSamplesPerClass)
+  print(nMinusOneSamplesCount)
+  print(nTargetClassHistogram)
+
+  nMinusOneLimit = nClassHistogram + nMinusOneSamplesCount
+  nClassOccurences = np.zeros(nClasses, np.int32)
+
+  oStratifiedDS = StratifiedCytaChatbotDataset(dataset_augmented.fs)
+
+  for oSample in dataset_augmented:
+    sAnswer = oSample.answer
+    nClassIndex = oSample.annotations
+    nClassOccurences[nClassIndex] += 1
+    nExtraSamples = nExtraSamplesPerClass[nClassIndex]
+    if nClassOccurences[nClassIndex] > nMinusOneLimit[nClassIndex]:
+      nExtraSamples -= 1
+
+    nBaseNewID = 1000000 + (oSample.id*1000)
+    oNewSample = TextQuestionSample(nBaseNewID, oSample.question, sAnswer, nClassIndex)
+    oQuestionSamples = list()
+    oStratifiedDS.class_questions[nClassIndex][oSample.id] = oQuestionSamples
+    oQuestionSamples.append(oNewSample)
+    for nExtraSampleIndex in range(nExtraSamples):
+      oNewSample = TextQuestionSample(nBaseNewID + 1 + nExtraSampleIndex, oSample.question_alt[nExtraSampleIndex], sAnswer, nClassIndex)
+      oQuestionSamples.append(oNewSample)
+
+
+  return oStratifiedDS
+
+
+
+oStratifiedDS = StratifiedBalancing(oNewDataSet, nClassHistogram)
+oStratifiedDS.save_questions(LANG)
+
+if False:
+  for k, v in oStratifiedDS.class_questions[2].items():
+    for s in v:
+      print(s.question)
+
radnn-0.0.9/test/test_text_pipeline.py (new file)
@@ -0,0 +1,17 @@
+from radnn import mlsys, FileSystem
+from datasets import StratifiedCytaChatbotDataset
+
+mlsys.filesys = FileSystem()
+oDataset = StratifiedCytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))
+if not oDataset.load_cache():
+  if oDataset.load_questions("EL"):
+    oDataset.split()
+    oDataset.print_info()
+    oDataset.save_cache()
+
+
+for nIndex in range(oDataset.ts_sample_count):
+  print(f"{oDataset.ts_sample_ids[nIndex]}§{oDataset.ts_labels[nIndex]}§{oDataset.ts_samples[nIndex]}")
+print("="*80)
+for nIndex in range(oDataset.vs_sample_count):
+  print(f"{oDataset.vs_sample_ids[nIndex]}§{oDataset.vs_labels[nIndex]}§{oDataset.vs_samples[nIndex]}")