radnn-0.0.8.tar.gz → radnn-0.0.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {radnn-0.0.8 → radnn-0.0.9}/PKG-INFO +4 -25
  2. {radnn-0.0.8 → radnn-0.0.9}/pyproject.toml +3 -4
  3. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/__init__.py +3 -2
  4. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/dataset_base.py +17 -5
  5. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/jsonfile.py +3 -0
  6. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/textfile.py +17 -20
  7. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/PKG-INFO +4 -25
  8. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/SOURCES.txt +3 -0
  9. radnn-0.0.9/test/test_corpus.py +91 -0
  10. radnn-0.0.9/test/test_corpus_load.py +209 -0
  11. radnn-0.0.9/test/test_text_pipeline.py +17 -0
  12. {radnn-0.0.8 → radnn-0.0.9}/LICENSE.txt +0 -0
  13. {radnn-0.0.8 → radnn-0.0.9}/README.md +0 -0
  14. {radnn-0.0.8 → radnn-0.0.9}/setup.cfg +0 -0
  15. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/core.py +0 -0
  16. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/__init__.py +0 -0
  17. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/data_feed.py +0 -0
  18. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/dataset_folder.py +0 -0
  19. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/image_dataset.py +0 -0
  20. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/image_dataset_files.py +0 -0
  21. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/preprocess/__init__.py +0 -0
  22. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/preprocess/normalizer.py +0 -0
  23. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/preprocess/standardizer.py +0 -0
  24. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/sample_set.py +0 -0
  25. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/sequence_dataset.py +0 -0
  26. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/subset_type.py +0 -0
  27. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/tf_classification_data_feed.py +0 -0
  28. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/errors.py +0 -0
  29. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/evaluation/__init__.py +0 -0
  30. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/evaluation/evaluate_classification.py +0 -0
  31. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/__init__.py +0 -0
  32. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/ml_experiment.py +0 -0
  33. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/ml_experiment_config.py +0 -0
  34. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/ml_experiment_env.py +0 -0
  35. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/experiment/ml_experiment_store.py +0 -0
  36. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/images/__init__.py +0 -0
  37. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/images/colors.py +0 -0
  38. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/images/image_processor.py +0 -0
  39. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/__init__.py +0 -0
  40. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/keras_learning_rate_scheduler.py +0 -0
  41. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/keras_optimization_algorithm.py +0 -0
  42. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/learning_algorithm.py +0 -0
  43. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/state/__init__.py +0 -0
  44. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/learn/state/keras_best_state_saver.py +0 -0
  45. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/ml_system.py +0 -0
  46. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/__init__.py +0 -0
  47. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_auto_multi_image.py +0 -0
  48. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_confusion_matrix.py +0 -0
  49. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_learning_curve.py +0 -0
  50. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_multi_scatter.py +0 -0
  51. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_roc.py +0 -0
  52. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/plots/plot_voronoi_2d.py +0 -0
  53. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/stats/__init__.py +0 -0
  54. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/stats/descriptive_stats.py +0 -0
  55. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/__init__.py +0 -0
  56. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/__init__.py +0 -0
  57. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/csvfile.py +0 -0
  58. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/filelist.py +0 -0
  59. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/fileobject.py +0 -0
  60. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/imgfile.py +0 -0
  61. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/picklefile.py +0 -0
  62. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/filestore.py +0 -0
  63. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/filesystem.py +0 -0
  64. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/hosts/__init__.py +0 -0
  65. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/hosts/colab_host.py +0 -0
  66. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/hosts/linux_host.py +0 -0
  67. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/hosts/windows_host.py +0 -0
  68. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/tee_logger.py +0 -0
  69. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/__init__.py +0 -0
  70. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/semaphore_lock.py +0 -0
  71. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/thread_context.py +0 -0
  72. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/thread_safe_queue.py +0 -0
  73. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/thread_safe_string_collection.py +0 -0
  74. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/threads/thread_worker.py +0 -0
  75. {radnn-0.0.8 → radnn-0.0.9}/src/radnn/utils.py +0 -0
  76. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/dependency_links.txt +0 -0
  77. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/requires.txt +0 -0
  78. {radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/top_level.txt +0 -0
  79. {radnn-0.0.8 → radnn-0.0.9}/test/test_config.py +0 -0
  80. {radnn-0.0.8 → radnn-0.0.9}/test/test_data_feed.py +0 -0
  81. {radnn-0.0.8 → radnn-0.0.9}/test/test_dataset_base.py +0 -0
  82. {radnn-0.0.8 → radnn-0.0.9}/test/test_dataset_from_pandas.py +0 -0
  83. {radnn-0.0.8 → radnn-0.0.9}/test/test_experiment_env.py +0 -0
  84. {radnn-0.0.8 → radnn-0.0.9}/test/test_image_dataset_from_files.py +0 -0
  85. {radnn-0.0.8 → radnn-0.0.9}/test/test_ml_experiment_config.py +0 -0
  86. {radnn-0.0.8 → radnn-0.0.9}/test/test_ml_system.py +0 -0
  87. {radnn-0.0.8 → radnn-0.0.9}/test/test_mlsystem.py +0 -0
  88. {radnn-0.0.8 → radnn-0.0.9}/test/test_mnist.py +0 -0
  89. {radnn-0.0.8 → radnn-0.0.9}/test/test_normalizer.py +0 -0
  90. {radnn-0.0.8 → radnn-0.0.9}/test/test_normalizer_div_zero.py +0 -0
  91. {radnn-0.0.8 → radnn-0.0.9}/test/test_sample_set.py +0 -0
  92. {radnn-0.0.8 → radnn-0.0.9}/test/test_standardizer.py +0 -0
  93. {radnn-0.0.8 → radnn-0.0.9}/test/test_train.py +0 -0
{radnn-0.0.8 → radnn-0.0.9}/PKG-INFO
@@ -1,35 +1,13 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: radnn
-Version: 0.0.8
+Version: 0.0.9
 Summary: Rapid Deep Neural Networks
 Author-email: "Pantelis I. Kaplanoglou" <pikaplanoglou@ihu.gr>
-License: MIT License
-
-Copyright (c) 2017-2025 Pantelis I. Kaplanoglou
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
+License-Expression: MIT
 Project-URL: Homepage, https://github.com/pikaplan/radnn
 Project-URL: Documentation, https://radnn.readthedocs.io/
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python
 Classifier: Topic :: Software Development
 Classifier: Topic :: Scientific/Engineering
@@ -47,6 +25,7 @@ Requires-Dist: numpy>=1.26.4
 Requires-Dist: matplotlib>=3.8.4
 Requires-Dist: pandas>=2.2.1
 Requires-Dist: scikit-learn>=1.4.2
+Dynamic: license-file
 
 # radnn - Rapid Deep Neural Networks
 
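The metadata hunk above replaces the embedded MIT license text with the SPDX expression form introduced by core metadata 2.4 (PEP 639). As a quick cross-check, the new fields can be read off an installed copy with the standard library alone; a minimal sketch, assuming radnn 0.0.9 is installed:

    # Read the core metadata of the installed distribution.
    from importlib.metadata import metadata

    meta = metadata("radnn")
    print(meta["Metadata-Version"])    # 2.4, per the hunk above
    print(meta["License-Expression"])  # MIT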
{radnn-0.0.8 → radnn-0.0.9}/pyproject.toml
@@ -1,18 +1,17 @@
 [project]
 name = "radnn"
-version = "0.0.8"
+version = "0.0.9"
 description = "Rapid Deep Neural Networks"
 readme = "README.md"
 authors = [
   {name = "Pantelis I. Kaplanoglou", email = "pikaplanoglou@ihu.gr"}
 ]
-
 requires-python = ">=3.7"
-license = {file = "LICENSE.txt"}
+license = "MIT"
+license-files = ["LICENSE.txt"]
 classifiers=[
   "Intended Audience :: Science/Research",
   "Intended Audience :: Developers",
-  "License :: OSI Approved :: MIT License",
   "Programming Language :: Python",
   "Topic :: Software Development",
   "Topic :: Scientific/Engineering",
{radnn-0.0.8 → radnn-0.0.9}/src/radnn/__init__.py
@@ -3,8 +3,9 @@
 # Version 0.0.6 [2025-02-04]
 # Version 0.0.7.2 [2025-02-17]
 # Version 0.0.7.3 [2025-02-21]
-# Version 0.0.8 [2025-02-xx]
-__version__ = "0.0.8"
+# Version 0.0.8 [2025-02-25]
+# Version 0.0.9 [2025-03-xx]
+__version__ = "0.0.9"
 
 from .system import FileStore, FileSystem
 from .ml_system import MLSystem
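Since __init__.py is the single source of the version string, the bump is also visible at runtime; a trivial check after upgrading:

    import radnn
    print(radnn.__version__)  # "0.0.9" after this release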
{radnn-0.0.8 → radnn-0.0.9}/src/radnn/data/dataset_base.py
@@ -236,7 +236,7 @@ class DataSetBase(object):
   def has_cache(self, samples_file_prefix="Samples"):
     return self.filestore.exists("%s.pkl" % samples_file_prefix) or self.filestore.exists("%s.TS.pkl" % samples_file_prefix)
   # --------------------------------------------------------------------------------------------------------------------
-  def load_cache(self, filestore: FileStore = None, samples_file_prefix="Samples", targets_file_prefix="Labels", is_verbose=False):
+  def load_cache(self, filestore: FileStore = None, samples_file_prefix="Samples", targets_file_prefix="Labels", ids_file_prefix="Ids", is_verbose=False):
     if filestore is None:
       filestore = self.filestore
     if filestore is None:
@@ -258,30 +258,39 @@
 
     self.samples = filestore.obj.load("%s.pkl" % samples_file_prefix)
     self.labels = filestore.obj.load("%s.pkl" % targets_file_prefix)
-
+
     if is_verbose:
       print("Loading training set ...")
     nTSSamples = filestore.obj.load("%s.TS.pkl" % samples_file_prefix)
     nTSTargets = filestore.obj.load("%s.TS.pkl" % targets_file_prefix)
     self.assign_training_set(nTSSamples, nTSTargets)
-
+    nTSIDs = filestore.obj.load("%s.TS.pkl" % ids_file_prefix)
+    if nTSIDs is not None:
+      self.ts_sample_ids = nTSIDs
+
     if is_verbose:
       print("Loading validation set ...")
     nVSSamples = filestore.obj.load("%s.VS.pkl" % samples_file_prefix)
     nVSTargets = filestore.obj.load("%s.VS.pkl" % targets_file_prefix)
     self.assign_validation_set(nVSSamples, nVSTargets)
-
+    nVSIds = filestore.obj.load("%s.VS.pkl" % ids_file_prefix)
+    if nVSIds is not None:
+      self.vs_sample_ids = nVSIds
+
     if is_verbose:
       print("Loading unknown test data set ...")
     nUTSamples = filestore.obj.load("%s.UT.pkl" % samples_file_prefix)
     if nUTSamples is not None:
       nUTTargets = filestore.obj.load("%s.UT.pkl" % targets_file_prefix)
       self.assign_unknown_test_set(nUTSamples, nUTTargets)
+      nUTIds = filestore.obj.load("%s.UT.pkl" % ids_file_prefix)
+      if nUTIds is not None:
+        self.ut_sample_ids = nUTIds
 
 
     return bResult
   # --------------------------------------------------------------------------------------------------------------------
-  def save_cache(self, filestore: FileStore = None, samples_file_prefix="Samples", targets_file_prefix="Labels"):
+  def save_cache(self, filestore: FileStore = None, samples_file_prefix="Samples", targets_file_prefix="Labels", ids_file_prefix="Ids"):
     if filestore is None:
       filestore = self.filestore
     if filestore is None:
@@ -293,13 +302,16 @@
 
     filestore.obj.save(self.ts_samples, "%s.TS.pkl" % samples_file_prefix, is_overwriting=True)
     filestore.obj.save(self.ts_labels, "%s.TS.pkl" % targets_file_prefix, is_overwriting=True)
+    filestore.obj.save(self.ts_sample_ids, "%s.TS.pkl" % ids_file_prefix, is_overwriting=True)
 
     filestore.obj.save(self.vs_samples, "%s.VS.pkl" % samples_file_prefix, is_overwriting=True)
     filestore.obj.save(self.vs_labels, "%s.VS.pkl" % targets_file_prefix, is_overwriting=True)
+    filestore.obj.save(self.vs_sample_ids, "%s.VS.pkl" % ids_file_prefix, is_overwriting=True)
 
     if self.ut_samples is not None:
       filestore.obj.save(self.ut_samples, "%s.UT.pkl" % samples_file_prefix, is_overwriting=True)
       filestore.obj.save(self.ut_labels, "%s.UT.pkl" % targets_file_prefix, is_overwriting=True)
+      filestore.obj.save(self.ut_sample_ids, "%s.UT.pkl" % ids_file_prefix, is_overwriting=True)
 
     self.card["name"] = self.name
     if self.feature_count is not None:
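The effect of the new ids_file_prefix parameter is that a cache round-trip now persists per-split sample identifiers (ts_sample_ids, vs_sample_ids, ut_sample_ids) alongside the samples and labels. A sketch of the intended usage, modeled on the bundled test_text_pipeline.py; the keyword arguments simply restate the defaults from the signatures above, and StratifiedCytaChatbotDataset comes from an external datasets module used by the tests, not from radnn itself:

    from radnn import mlsys, FileSystem
    from datasets import StratifiedCytaChatbotDataset  # external helper used by the tests

    mlsys.filesys = FileSystem()
    oDataset = StratifiedCytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))

    # Writes Samples.TS.pkl / Labels.TS.pkl and, new in 0.0.9, Ids.TS.pkl
    # (plus the .VS and .UT counterparts where those splits exist).
    oDataset.save_cache(samples_file_prefix="Samples",
                        targets_file_prefix="Labels",
                        ids_file_prefix="Ids")

    # On reload, the *_sample_ids attributes are restored whenever the
    # corresponding Ids pickles are present; load_cache returns a boolean.
    if oDataset.load_cache(ids_file_prefix="Ids", is_verbose=True):
      print(oDataset.ts_sample_ids[:5])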
{radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/jsonfile.py
@@ -32,6 +32,9 @@ import json
 import glob
 from .fileobject import FileObject
 
+#TODO: jsonpickle
+#https://stackoverflow.com/questions/3768895/how-to-make-a-class-json-serializable
+
 class JSONFile(FileObject):
   # ----------------------------------------------------------------------------------
   def __init__(self, filename, parent_folder=None, error_template=None):
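The new TODO points at the standard options for serializing arbitrary Python objects to JSON: a default hook for the built-in json module, or jsonpickle for type-preserving round-trips. A minimal, self-contained sketch of the idea behind the linked answer; the Card class is a made-up stand-in, not radnn code:

    import json

    class Card:
      # Hypothetical example object with plain attributes.
      def __init__(self, name, feature_count):
        self.name = name
        self.feature_count = feature_count

    # json falls back to the hook for non-serializable types; using the
    # object's __dict__ is the simplest variant, and jsonpickle generalizes
    # it by also recording types so objects can be reconstructed on load.
    print(json.dumps(Card("mnist", 784), default=lambda o: o.__dict__))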
{radnn-0.0.8 → radnn-0.0.9}/src/radnn/system/files/textfile.py
@@ -43,24 +43,24 @@ class TextFile(FileObject):
   def load(self, filename=None, encoding=None):
     filename = self._useFileName(filename)
 
-    oEncodingToTry = ["utf-8", "utf-16", "latin1", "ascii"]  # Add more if needed
-
     sText = None
-    if encoding is None:
-      bIsLoaded = False
-      for sEnc in oEncodingToTry:
-        try:
-          with open(filename, "r", encoding=sEnc) as oFile:
-            sText = oFile.read()
-          bIsLoaded = True
-          break
-        except (UnicodeDecodeError, UnicodeError):
-          continue
-      if not bIsLoaded:
-        raise ValueError("Unsupported encoding")
-    else:
-      with open(filename, "r", encoding=encoding) as oFile:
-        sText = oFile.read()
+    if os.path.isfile(filename):
+      oEncodingToTry = ["utf-8", "utf-16", "latin1", "ascii"]  # Add more if needed
+      if encoding is None:
+        bIsLoaded = False
+        for sEnc in oEncodingToTry:
+          try:
+            with open(filename, "r", encoding=sEnc) as oFile:
+              sText = oFile.read()
+            bIsLoaded = True
+            break
+          except (UnicodeDecodeError, UnicodeError):
+            continue
+        if not bIsLoaded:
+          raise ValueError("Unsupported encoding")
+      else:
+        with open(filename, "r", encoding=encoding) as oFile:
+          sText = oFile.read()
 
     return sText
   # --------------------------------------------------------------------------------------------------------------------
@@ -74,9 +74,6 @@ class TextFile(FileObject):
       p_sFileName : Full path to the text file
       p_sText : Text to write
     """
-    if (self.parent_folder is not None):
-      sFilename = os.path.join(self.parent_folder, sFilename)
-
     if self.is_verbose:
       print(" {.} Saving text to %s" % sFilename)
 
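Taken together, the two textfile.py hunks make load return None for a missing file instead of failing inside open, while keeping the encoding-fallback loop for files that exist; save no longer joins parent_folder itself, presumably because path resolution is centralized (load already goes through _useFileName). A usage sketch; the constructor arguments are an assumption, mirrored from JSONFile(filename, parent_folder=None, ...) in the same package:

    from radnn.system.files.textfile import TextFile

    oFile = TextFile("notes.txt", parent_folder="/tmp")  # assumed signature

    sText = oFile.load()                   # tries utf-8, utf-16, latin1, ascii in turn
    if sText is None:
      print("file does not exist")         # new in 0.0.9: a missing file yields None

    sText = oFile.load(encoding="utf-8")   # explicit encoding bypasses the fallback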
{radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/PKG-INFO
@@ -1,35 +1,13 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: radnn
-Version: 0.0.8
+Version: 0.0.9
 Summary: Rapid Deep Neural Networks
 Author-email: "Pantelis I. Kaplanoglou" <pikaplanoglou@ihu.gr>
-License: MIT License
-
-Copyright (c) 2017-2025 Pantelis I. Kaplanoglou
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
+License-Expression: MIT
 Project-URL: Homepage, https://github.com/pikaplan/radnn
 Project-URL: Documentation, https://radnn.readthedocs.io/
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python
 Classifier: Topic :: Software Development
 Classifier: Topic :: Scientific/Engineering
@@ -47,6 +25,7 @@ Requires-Dist: numpy>=1.26.4
 Requires-Dist: matplotlib>=3.8.4
 Requires-Dist: pandas>=2.2.1
 Requires-Dist: scikit-learn>=1.4.2
+Dynamic: license-file
 
 # radnn - Rapid Deep Neural Networks
 
{radnn-0.0.8 → radnn-0.0.9}/src/radnn.egg-info/SOURCES.txt
@@ -73,6 +73,8 @@ src/radnn/system/threads/thread_safe_queue.py
 src/radnn/system/threads/thread_safe_string_collection.py
 src/radnn/system/threads/thread_worker.py
 test/test_config.py
+test/test_corpus.py
+test/test_corpus_load.py
 test/test_data_feed.py
 test/test_dataset_base.py
 test/test_dataset_from_pandas.py
@@ -86,4 +88,5 @@ test/test_normalizer.py
 test/test_normalizer_div_zero.py
 test/test_sample_set.py
 test/test_standardizer.py
+test/test_text_pipeline.py
 test/test_train.py
radnn-0.0.9/test/test_corpus.py (new file)
@@ -0,0 +1,91 @@
+from radnn import mlsys, FileSystem
+from openpyxl import load_workbook
+from datasets import TextQuestionSample, CytaChatbotDataset
+from chatgpt import ChatGPTAPI
+import re
+from tqdm import tqdm
+
+def split_number(text):
+  match = re.match(r'^(\d+[.)])\s*(.*)', text)
+  if match:
+    number = match.group(1)  # The numeric part with ) or .
+    rest = match.group(2)    # The remaining text
+    return number, rest
+  else:
+    return None, text
+
+mlsys.filesys = FileSystem()
+oDataset = CytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))
+
+
+API_KEY = "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"
+oAPI = ChatGPTAPI(API_KEY)
+
+for sLang in ["EL", "EN"]:
+  if not oDataset.load_question_answers(sLang):
+    sImportFileName = oDataset.fs.subfs("source").file(f"CytaChatbot_v2-{sLang}.xlsx")
+    # Load workbook and select a sheet
+    wb = load_workbook(sImportFileName)
+    sheet = wb.active  # Or wb['SheetName']
+
+    # Iterate over rows
+
+    bIsQuestion = True
+    nIndex = -1
+    for row in sheet.iter_rows(values_only=True):
+      assert row[1] is None, "More columns"
+      bIsQuestion = row[0] is not None
+      bIsAnswer = row[0] is not None
+      if bIsQuestion:
+        nID, sQuestion = split_number(row[0].strip())
+        bIsQuestion = nID is not None
+        if bIsQuestion:
+          nIndex += 1
+          oSample = TextQuestionSample()
+          oDataset.append(oSample)
+
+          sRow = row[0].strip()
+          nID = None
+          if (sRow[1] == ".") or (sRow[1] == ")"):
+            nID = int(sRow[:1])
+            sQuestion = sRow[2:]
+          if (sRow[2] == ".") or (sRow[2] == ")"):
+            nID = int(sRow[:2])
+            sQuestion = sRow[3:]
+          if (sRow[3] == ".") or (sRow[3] == ")"):
+            nID = int(sRow[:3])
+            sQuestion = sRow[4:]
+          oSample.id = nID
+
+          oSample.question = sQuestion.strip()
+          oSample.answer = ""
+          bIsQuestion = False
+      elif bIsAnswer:
+        sRow = row[0].strip()
+        if oSample.answer == "":
+          oSample.answer += "\n" + sRow
+        else:
+          oSample.answer = sRow
+
+      print(oSample)
+
+
+    oDataset.save_question_answers(sLang)
+
+  EXTRA_QUESTIONS_COUNT = 30
+  for nIndex, oSample in tqdm(enumerate(oDataset)):
+    try:
+      if sLang == "EL":
+        prompt = f"Γράψε {EXTRA_QUESTIONS_COUNT} παραλλαγές της ερώτησης: '{oSample.question}', στην απάντηση βάλε μόνο τις παραλλαγές"
+      else:
+        prompt = f"Write {EXTRA_QUESTIONS_COUNT} variations of the question: '{oSample.question}', keep only the variations in the answer"
+      response = oAPI.generate(prompt)
+      oSample.question_alt = response.split('\n')
+
+      print(oSample)
+      print(oSample.question_alt)
+    except Exception as e:
+      new_questions = f"Error: {e}"
+
+  oDataset.save_question_answers_aug(sLang)
+
radnn-0.0.9/test/test_corpus_load.py (new file)
@@ -0,0 +1,209 @@
+import random
+
+from radnn import mlsys, FileSystem
+from openpyxl import load_workbook
+from datasets import TextQuestionSample, CytaChatbotDataset, StratifiedCytaChatbotDataset
+from chatgpt import ChatGPTAPI
+import re
+from tqdm import tqdm
+import ast
+import numpy as np
+
+LANG = "EL"
+
+def split_alt_id_question(question):
+  result = []
+  match = re.match(r"(\d+)\.\s*(.+)", question)
+  if match:
+    result.append(match.group(1))
+    result.append(match.group(2))
+  return result
+
+
+API_KEY = "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"
+oAPI = ChatGPTAPI(API_KEY)
+
+
+
+mlsys.filesys = FileSystem()
+oDataset = CytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))
+if not oDataset.load_question_answers_aug(LANG):
+  print("[>] Attaching alternative questions ...")
+  oDataset.load_question_answers(LANG)
+
+
+  # Calculate the difference between consecutive elements
+  oIds = np.asarray([oSample.id for oSample in oDataset], dtype=np.int32)
+  diffs = np.diff(oIds)
+
+  # Find indices where the difference is more than 1 (indicates a gap)
+  gap_indices = np.where(diffs > 1)[0]
+  if len(gap_indices) > 0:
+    print("[x] Gaps in sample numbering:")
+    # Report the missing numbers
+    for index in gap_indices:
+      start = oIds[index] + 1
+      end = oIds[index + 1] - 1
+      missing = list(range(start, end + 1))
+      print(f"|__ Missing: {missing}")
+
+  if oDataset.fs.subfs("source").exists(f"CytaChatbot_v1_Targets-{LANG}.xlsx"):
+    print("|__ Attaching annotations")
+    sImportFileName = oDataset.fs.subfs("source").file(f"CytaChatbot_v1_Targets-{LANG}.xlsx")
+    oWorkbook = load_workbook(sImportFileName)
+    oSheet = oWorkbook.active  # Or wb['SheetName']
+    oTargets = []
+    for nIndex, oRow in enumerate(oSheet.iter_rows(values_only=True)):
+      if (nIndex > 0) and (oRow[0] is not None):
+        try:
+          nOnehot = np.asarray(oRow[2:5], dtype=np.int32)
+        except Exception as e:
+          print(nIndex, oRow)
+          raise
+        assert nOnehot.sum() == 1, "More than one tags"
+
+        nID = nIndex - 1
+        sID = oRow[0]
+        sSample = oRow[1].strip().replace("?", ";")
+        sSampleV2 = oDataset[nID].question.strip().replace("?", ";")
+        nTarget = np.argmax(nOnehot)
+        oTargets.append(nTarget)
+        oDataset[nID].annotations = nTarget
+        #if sSample != sSampleV2:
+        #  print(nSampleID, sSample, "!=", sSampleV2, nTarget)
+        #else:
+        #  print(nSampleID, sSample, nTarget)
+  else:
+    # TODO: EN Annotations
+    random.seed(2025)
+    oTargets = []
+    for nIndex, oTextRecord in enumerate(oDataset):
+      nTarget = random.randint(0, 2)
+      oTargets.append(nTarget)
+      oTextRecord.annotations = nTarget
+
+
+  print("|__ Adding augmentations")
+  if False:
+    EXTRA_QUESTIONS_COUNT = 30
+    for nIndex, oTextRecord in enumerate(tqdm(oDataset)):
+      try:
+        if LANG == "EL":
+          prompt = f"Γράψε {EXTRA_QUESTIONS_COUNT} παραλλαγές της ερώτησης: '{oTextRecord.question}', στην απάντηση βάλε μόνο τις παραλλαγές"
+        else:
+          prompt = f"Write {EXTRA_QUESTIONS_COUNT} variations of the question: '{oTextRecord.question}', keep only the variations in the answer"
+        response = oAPI.generate(prompt)
+        oTextRecord.question_alt = response.split('\n')
+
+        print(oTextRecord)
+        print(oTextRecord.question_alt)
+      except Exception as e:
+        new_questions = f"Error: {e}"
+
+    oDataset.save_question_answers_aug(LANG)
+  else:
+    mlsys.filesys = FileSystem()
+    oFS = mlsys.filesys.datasets.subfs("CYTACHATBOT")
+    sFileContents = oFS.text.load(f"Augmented_{LANG}.txt")
+    oFilteredLines = []
+    for sLine in sFileContents.splitlines():
+      if sLine.startswith("['") or sLine.startswith('["'):
+        oFilteredLines.append(sLine)
+
+    nSampleIndex = 0
+    for sLine in oFilteredLines:
+      oList = ast.literal_eval(sLine)
+      oListClean = []
+      for x in oList:
+        oParts = split_alt_id_question(x)
+        if len(oParts) > 1:
+          oListClean.append(oParts[1].strip().replace("?", ";"))
+      #print(sID, oListClean)
+      assert len(oListClean) == 30, "Wrong count of alternative questions"
+      oDataset[nSampleIndex].question_alt = oListClean
+      oDataset[nSampleIndex].question = oDataset[nSampleIndex].question.strip().replace("?", ";")
+      nSampleIndex += 1
+
+    oNewDataSet = CytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))
+    for oSample in oDataset:
+      oNewDataSet.append(oSample)
+    oNewDataSet.save_question_answers_aug(LANG)
+else:
+  oNewDataSet = oDataset
+  oTargets = [oSample.annotations for oSample in oDataset]
+
+nClassHistogram, bin_edges = np.histogram(oTargets, bins=3)
+
+
+
+
+
+def StratifiedBalancing(dataset_augmented, class_histogram=[89, 157, 358]):
+  nClassHistogram = class_histogram
+  nClasses = len(nClassHistogram)
+  nExtraSamples = 30
+  nMinorityClassCount = np.min(nClassHistogram)
+  nMaxSamples = nMinorityClassCount*(nExtraSamples + 1)
+  nTarget = (nMaxSamples // 50) * 50
+
+  nMaxSamplesPerClass = np.zeros(nClasses, np.int32)
+  nTargetClassHistogram = np.zeros(nClasses, np.int32)
+  nExtraSamplesPerClass = np.zeros(nClasses, np.int32)
+  nMinusOneSamplesCount = np.zeros(nClasses, np.int32)
+
+  for nIndex, nOriginalCount in enumerate(nClassHistogram):
+    if nOriginalCount != nMinorityClassCount:
+      nExtraSamplesCeil = int(np.ceil(nTarget / nOriginalCount)) - 1
+      nMaxSamplesPerClass[nIndex] = nOriginalCount * (nExtraSamplesCeil + 1)
+      nExtraSamplesPerClass[nIndex] = nExtraSamplesCeil
+      nTargetClassHistogram[nIndex] = nTarget
+      nMinusOneSamplesCount[nIndex] = nTarget - nMaxSamplesPerClass[nIndex]
+    else:
+      nMaxSamplesPerClass[nIndex] = nMaxSamples
+      nExtraSamplesPerClass[nIndex] = nExtraSamples
+      nTargetClassHistogram[nIndex] = nTarget
+      nMinusOneSamplesCount[nIndex] = nTarget - nMaxSamples
+
+
+
+  print(nClassHistogram)
+  print(nExtraSamplesPerClass)
+  print(nMaxSamplesPerClass)
+  print(nMinusOneSamplesCount)
+  print(nTargetClassHistogram)
+
+  nMinusOneLimit = nClassHistogram + nMinusOneSamplesCount
+  nClassOccurences = np.zeros(nClasses, np.int32)
+
+  oStratifiedDS = StratifiedCytaChatbotDataset(dataset_augmented.fs)
+
+  for oSample in dataset_augmented:
+    sAnswer = oSample.answer
+    nClassIndex = oSample.annotations
+    nClassOccurences[nClassIndex] += 1
+    nExtraSamples = nExtraSamplesPerClass[nClassIndex]
+    if nClassOccurences[nClassIndex] > nMinusOneLimit[nClassIndex]:
+      nExtraSamples -= 1
+
+    nBaseNewID = 1000000 + (oSample.id*1000)
+    oNewSample = TextQuestionSample(nBaseNewID, oSample.question, sAnswer, nClassIndex)
+    oQuestionSamples = list()
+    oStratifiedDS.class_questions[nClassIndex][oSample.id] = oQuestionSamples
+    oQuestionSamples.append(oNewSample)
+    for nExtraSampleIndex in range(nExtraSamples):
+      oNewSample = TextQuestionSample(nBaseNewID + 1 + nExtraSampleIndex, oSample.question_alt[nExtraSampleIndex], sAnswer, nClassIndex)
+      oQuestionSamples.append(oNewSample)
+
+
+  return oStratifiedDS
+
+
+
+oStratifiedDS = StratifiedBalancing(oNewDataSet, nClassHistogram)
+oStratifiedDS.save_questions(LANG)
+
+if False:
+  for k, v in oStratifiedDS.class_questions[2].items():
+    for s in v:
+      print(s.question)
+
radnn-0.0.9/test/test_text_pipeline.py (new file)
@@ -0,0 +1,17 @@
+from radnn import mlsys, FileSystem
+from datasets import StratifiedCytaChatbotDataset
+
+mlsys.filesys = FileSystem()
+oDataset = StratifiedCytaChatbotDataset(mlsys.filesys.datasets.subfs("CYTACHATBOT"))
+if not oDataset.load_cache():
+  if oDataset.load_questions("EL"):
+    oDataset.split()
+    oDataset.print_info()
+    oDataset.save_cache()
+
+
+for nIndex in range(oDataset.ts_sample_count):
+  print(f"{oDataset.ts_sample_ids[nIndex]}§{oDataset.ts_labels[nIndex]}§{oDataset.ts_samples[nIndex]}")
+print("="*80)
+for nIndex in range(oDataset.vs_sample_count):
+  print(f"{oDataset.vs_sample_ids[nIndex]}§{oDataset.vs_labels[nIndex]}§{oDataset.vs_samples[nIndex]}")