sciveo 0.1.39__tar.gz → 0.1.40__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (145)
  1. {sciveo-0.1.39 → sciveo-0.1.40}/PKG-INFO +2 -1
  2. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/pipeline.py +1 -0
  3. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/generators.py +51 -1
  4. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/images/description.py +47 -5
  5. sciveo-0.1.40/sciveo/ml/nlp/tokenizers/bpe.py +76 -0
  6. sciveo-0.1.40/sciveo/tools/aws/__init__.py +0 -0
  7. sciveo-0.1.40/sciveo/version.py +2 -0
  8. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/PKG-INFO +2 -1
  9. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/SOURCES.txt +4 -1
  10. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/requires.txt +4 -0
  11. {sciveo-0.1.39 → sciveo-0.1.40}/setup.py +5 -1
  12. sciveo-0.1.40/test/test_tokenizers.py +34 -0
  13. sciveo-0.1.39/sciveo/version.py +0 -2
  14. {sciveo-0.1.39 → sciveo-0.1.40}/README.md +0 -0
  15. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/__init__.py +0 -0
  16. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/api/__init__.py +0 -0
  17. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/api/base.py +0 -0
  18. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/api/upload.py +0 -0
  19. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/cli.py +0 -0
  20. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/common/__init__.py +0 -0
  21. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/common/configuration.py +0 -0
  22. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/common/model.py +0 -0
  23. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/common/optimizers.py +0 -0
  24. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/common/sampling.py +0 -0
  25. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/content/__init__.py +0 -0
  26. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/content/dataset.py +0 -0
  27. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/content/experiment.py +0 -0
  28. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/content/project.py +0 -0
  29. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/content/runner.py +0 -0
  30. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/__init__.py +0 -0
  31. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/__init__.py +0 -0
  32. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/base.py +0 -0
  33. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/encoders/__init__.py +0 -0
  34. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/encoders/base.py +0 -0
  35. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/encoders/normalizer.py +0 -0
  36. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/nlp/__init__.py +0 -0
  37. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/nlp/search.py +0 -0
  38. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/time_series/__init__.py +0 -0
  39. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/time_series/dataset.py +0 -0
  40. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/time_series/predictor.py +0 -0
  41. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/time_series/trainer.py +0 -0
  42. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/ml/time_series/window_generator.py +0 -0
  43. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/__init__.py +0 -0
  44. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/base.py +0 -0
  45. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/job_daemon.py +0 -0
  46. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/layouts/__init__.py +0 -0
  47. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/layouts/base.py +0 -0
  48. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/postprocessors/__init__.py +0 -0
  49. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/postprocessors/base.py +0 -0
  50. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/postprocessors/default.py +0 -0
  51. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/__init__.py +0 -0
  52. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/audio/__init__.py +0 -0
  53. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/audio/audio.py +0 -0
  54. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/audio/audio_extractor_process.py +0 -0
  55. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/aws.py +0 -0
  56. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/base.py +0 -0
  57. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/file/__init__.py +0 -0
  58. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/file/archive.py +0 -0
  59. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/__init__.py +0 -0
  60. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/album.py +0 -0
  61. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/album_in_image.py +0 -0
  62. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/depth_esimation.py +0 -0
  63. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/embeddings.py +0 -0
  64. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/filters.py +0 -0
  65. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/histogram.py +0 -0
  66. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/mask.py +0 -0
  67. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/object_detection.py +0 -0
  68. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/resize.py +0 -0
  69. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/segmentation.py +0 -0
  70. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/watermark.py +0 -0
  71. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/media_info.py +0 -0
  72. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/nlp/__init__.py +0 -0
  73. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/nlp/address.py +0 -0
  74. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/qr.py +0 -0
  75. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/sci/__init__.py +0 -0
  76. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/sci/base.py +0 -0
  77. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/sci/dataset.py +0 -0
  78. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/sci/time_series/__init__.py +0 -0
  79. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/sci/time_series/predictor.py +0 -0
  80. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/sci/time_series/trainer.py +0 -0
  81. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/tpu_base.py +0 -0
  82. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/video/__init__.py +0 -0
  83. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/video/generators.py +0 -0
  84. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/video/motion_detection.py +0 -0
  85. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/video/resize.py +0 -0
  86. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/video/video_album.py +0 -0
  87. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/video/video_frames.py +0 -0
  88. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/video/video_resample.py +0 -0
  89. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/queues.py +0 -0
  90. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/server.py +0 -0
  91. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/web/__init__.py +0 -0
  92. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/web/server.py +0 -0
  93. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/__init__.py +0 -0
  94. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/base.py +0 -0
  95. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/evaluation/__init__.py +0 -0
  96. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/evaluation/object_detection.py +0 -0
  97. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/images/__init__.py +0 -0
  98. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/images/base.py +0 -0
  99. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/images/embeddings.py +0 -0
  100. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/images/object_detection.py +0 -0
  101. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/images/tools.py +0 -0
  102. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/images/transformers.py +0 -0
  103. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/nlp/__init__.py +0 -0
  104. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/nlp/embeddings.py +0 -0
  105. {sciveo-0.1.39/sciveo/ml/video → sciveo-0.1.40/sciveo/ml/nlp/tokenizers}/__init__.py +0 -0
  106. {sciveo-0.1.39/sciveo/monitoring → sciveo-0.1.40/sciveo/ml/video}/__init__.py +0 -0
  107. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/video/description.py +0 -0
  108. {sciveo-0.1.39/sciveo/network → sciveo-0.1.40/sciveo/monitoring}/__init__.py +0 -0
  109. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/monitoring/monitor.py +0 -0
  110. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/monitoring/start.py +0 -0
  111. {sciveo-0.1.39/sciveo/tools → sciveo-0.1.40/sciveo/network}/__init__.py +0 -0
  112. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/network/camera.py +0 -0
  113. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/network/sniffer.py +0 -0
  114. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/network/tools.py +0 -0
  115. {sciveo-0.1.39/sciveo/tools/aws → sciveo-0.1.40/sciveo/tools}/__init__.py +0 -0
  116. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/array.py +0 -0
  117. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/aws/priority_queue.py +0 -0
  118. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/aws/s3.py +0 -0
  119. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/common.py +0 -0
  120. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/complexity.py +0 -0
  121. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/compress.py +0 -0
  122. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/configuration.py +0 -0
  123. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/crypto.py +0 -0
  124. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/daemon.py +0 -0
  125. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/formating.py +0 -0
  126. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/hardware.py +0 -0
  127. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/http.py +0 -0
  128. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/logger.py +0 -0
  129. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/os.py +0 -0
  130. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/random.py +0 -0
  131. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/remote.py +0 -0
  132. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/simple_counter.py +0 -0
  133. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/synchronized.py +0 -0
  134. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo/tools/timers.py +0 -0
  135. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/dependency_links.txt +0 -0
  136. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/entry_points.txt +0 -0
  137. {sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/top_level.txt +0 -0
  138. {sciveo-0.1.39 → sciveo-0.1.40}/setup.cfg +0 -0
  139. {sciveo-0.1.39 → sciveo-0.1.40}/test/test_complexity.py +0 -0
  140. {sciveo-0.1.39 → sciveo-0.1.40}/test/test_compress.py +0 -0
  141. {sciveo-0.1.39 → sciveo-0.1.40}/test/test_configuration.py +0 -0
  142. {sciveo-0.1.39 → sciveo-0.1.40}/test/test_crypto.py +0 -0
  143. {sciveo-0.1.39 → sciveo-0.1.40}/test/test_monitoring.py +0 -0
  144. {sciveo-0.1.39 → sciveo-0.1.40}/test/test_runner.py +0 -0
  145. {sciveo-0.1.39 → sciveo-0.1.40}/test/test_sampling.py +0 -0
{sciveo-0.1.39 → sciveo-0.1.40}/PKG-INFO
@@ -1,12 +1,13 @@
 Metadata-Version: 2.1
 Name: sciveo
-Version: 0.1.39
+Version: 0.1.40
 Description-Content-Type: text/markdown
 Provides-Extra: mon
 Provides-Extra: net
 Provides-Extra: media
 Provides-Extra: media-server
 Provides-Extra: media-ml
+Provides-Extra: media-ml-all
 Provides-Extra: all
 Provides-Extra: media-all
 
{sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/pipeline.py
@@ -81,6 +81,7 @@ class MediaPipeline:
   "image-diffusion": ImageDiffusionText,
   "image-diffusion-image-text": ImageDiffusionImageText,
   "image-to-text": ImageToTextProcessor,
+  "image-query": ImageQueryProcessor,
   "image-fgbg-filter": ImageFGBGFilter,
   "image-segmentation": ImageSegmentation,
   "image-depth-estimation": ImageDepthEstimation,
{sciveo-0.1.39 → sciveo-0.1.40}/sciveo/media/pipelines/processors/image/generators.py
@@ -24,7 +24,7 @@ from sciveo.tools.logger import *
 from sciveo.tools.common import *
 from sciveo.media.pipelines.processors.tpu_base import *
 from sciveo.media.pipelines.base import ApiContent
-from sciveo.ml.images.description import ImageToText
+from sciveo.ml.images.description import ImageToText, ImageQuery
 
 
 class ImageDiffusionText(TPUBaseProcessor):
@@ -202,6 +202,56 @@ class ImageToTextProcessor(TPUBaseProcessor):
     return True
 
 
+class ImageQueryProcessor(TPUBaseProcessor):
+  def __init__(self, processor_config, max_progress) -> None:
+    super().__init__(processor_config, max_progress)
+
+    self.api = ApiContent()
+
+    self.cache_dir = os.path.join(os.environ['MEDIA_MODELS_BASE_PATH'], "models/")
+    self.device = os.environ.get("MEDIA_PROCESSING_BACKEND", "cpu")
+
+    self.default.update({
+      "max_length": 64,
+      "model_id": 0,
+      "query": "describe the image",
+      "output": False
+    })
+
+    self.predictor = None
+
+  def process(self, media):
+    debug("process", media['guid'])
+    if self.predictor is None:
+      self.predictor = ImageQuery(self['query'], self['model_id'], self['max_length'], self.cache_dir, self.device)
+    local_path = media["local_path"]
+    predict = self.predictor.predict_one(local_path)
+    return self.set_media(media, predict)
+
+  def set_media(self, media, predict):
+    media.setdefault("next", [])
+    media["next"].append({
+      "guid": f"TXT-{media['guid']}",
+      "parent": media['guid'],
+      "content_type": "comment",
+      "content_text": predict,
+      "owner": media["owner"],
+      "name": f"{predict} [{self['max_length']}]",
+      "processor": self.name()
+    })
+
+    return media
+
+  def content_type(self):
+    return "image"
+
+  def name(self):
+    return "image-query"
+
+  def is_append_processor(self):
+    return True
+
+
 class ImageDiffusionImageText(TPUBaseProcessor):
   def __init__(self, processor_config, max_progress) -> None:
     super().__init__(processor_config, max_progress)
{sciveo-0.1.39 → sciveo-0.1.40}/sciveo/ml/images/description.py
@@ -33,12 +33,14 @@ class ImageToText(BaseImageML):
     self.models = [
       ["GIT", "softel/git-base-v1.0", "auto"],
       ["GIT", "softel/git-large-v1.0", "auto"],
+      ["CAPTION", "softel/image-instruct-v2.0", "auto"],
       ["BLIP2", "softel/blip2-opt-2.7b-v1.0", torch.float16],
       # ["BLIP2", "softel/blip2-opt-6.7b-v1.0", torch.float16],
     ]
 
     model_config = self.models[model_id]
     self.dtype = model_config[2]
+    self.model_type = model_config[0]
 
     if model_config[0] == "GIT":
       self.pipe = AutoProcessor.from_pretrained(model_config[1], cache_dir=self.cache_dir)
@@ -46,6 +48,9 @@ class ImageToText(BaseImageML):
     elif model_config[0] == "BLIP2":
       self.pipe = Blip2Processor.from_pretrained(model_config[1], cache_dir=self.cache_dir)
       self.model = Blip2ForConditionalGeneration.from_pretrained(model_config[1], torch_dtype=self.dtype, device_map="auto", cache_dir=self.cache_dir)
+    elif model_config[0] == "CAPTION":
+      self.model = AutoModelForCausalLM.from_pretrained(model_config[1], torch_dtype=self.dtype, device_map="auto", cache_dir=self.cache_dir, trust_remote_code=True)
+      self.pipe = self.model
 
     debug("model name", model_config[1], "on device", self.device, "dtype", self.dtype, self.model.dtype)
     self.dtype = self.model.dtype
@@ -53,12 +58,49 @@ class ImageToText(BaseImageML):
   def predict(self, images):
     images = self.load(images)
 
-    pixel_values = self.pipe(images=images, return_tensors="pt").pixel_values.to(self.device, self.dtype)
-    ids = self.model.generate(pixel_values=pixel_values, max_length=self.max_length)
-    prediction = self.pipe.batch_decode(ids, skip_special_tokens=True)
+    if self.model_type == "CAPTION":
+      prediction = []
+      if self.max_length < 64:
+        predict_length = "short"
+      else:
+        predict_length = "normal"
+      for image in images:
+        prediction.append(self.model.caption(image, length=predict_length))
+    else:
+      pixel_values = self.pipe(images=images, return_tensors="pt").pixel_values.to(self.device, self.dtype)
+      ids = self.model.generate(pixel_values=pixel_values, max_length=self.max_length)
+      prediction = self.pipe.batch_decode(ids, skip_special_tokens=True)
 
-    del ids
-    del pixel_values
+      del ids
+      del pixel_values
 
     # debug("image description", prediction)
     return prediction
+
+
+class ImageQuery(BaseImageML):
+  def __init__(self, query, model_id, max_length=64, cache_dir=None, device=None) -> None:
+    super().__init__(model_id, cache_dir, device)
+    self.query = query
+    self.max_length = max_length
+
+    self.models = [
+      ["CAPTION", "softel/image-instruct-v2.0", "auto"],
+      # ["BLIP2", "softel/blip2-opt-2.7b-v1.0", torch.float16],
+      # ["BLIP2", "softel/blip2-opt-6.7b-v1.0", torch.float16],
+    ]
+
+    model_config = self.models[model_id]
+    self.dtype = model_config[2]
+    self.model_type = model_config[0]
+
+    if model_config[0] == "CAPTION":
+      self.model = AutoModelForCausalLM.from_pretrained(model_config[1], torch_dtype=self.dtype, device_map="auto", cache_dir=self.cache_dir, trust_remote_code=True)
+      self.pipe = self.model
+
+    debug("model name", model_config[1], "on device", self.device, "dtype", self.dtype, self.model.dtype)
+    self.dtype = self.model.dtype
+
+  def predict_one(self, x):
+    image = self.load_image(x)
+    return self.model.query(image, self.query)["answer"]
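A minimal usage sketch of the new ImageQuery class, called directly rather than through the image-query pipeline processor. The image path, cache directory and device below are placeholders, and it assumes the softel/image-instruct-v2.0 weights can be fetched with trust_remote_code=True; the constructor arguments and the predict_one() call mirror how ImageQueryProcessor drives the class in the hunks above.

from sciveo.ml.images.description import ImageQuery

# model_id=0 selects the ["CAPTION", "softel/image-instruct-v2.0", "auto"] entry.
vqa = ImageQuery(query="what objects are visible?", model_id=0, max_length=64,
                 cache_dir="/tmp/models", device="cpu")

# predict_one() loads the image and returns the "answer" field of model.query().
print(vqa.predict_one("/path/to/photo.jpg"))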
sciveo-0.1.40/sciveo/ml/nlp/tokenizers/bpe.py (new file)
@@ -0,0 +1,76 @@
+#
+# Pavlin Georgiev, Softel Labs
+#
+# This is a proprietary file and may not be copied,
+# distributed, or modified without express permission
+# from the owner. For licensing inquiries, please
+# contact pavlin@softel.bg.
+#
+# 2024
+#
+
+import re
+
+from sciveo.tools.logger import *
+
+
+class BPETokenizer:
+  def __init__(self, max_size):
+    self.initial_tokens = 256
+    self.max_size = max_size
+    self.max_merges = max_size - self.initial_tokens
+    self.vocab = {}
+    self.merges = {}
+
+  def encode(self, text):
+    tokens = list(map(int, text.encode("utf-8")))
+    l1 = len(tokens)
+    for k, v in self.merges.items():
+      self.merge(tokens, k, v)
+    debug(f"encoded ratio {len(tokens) / l1:.2}x")
+    return tokens
+
+  def decode_token(self, token):
+    if token not in self.vocab:
+      return [token]
+
+    bigram = self.vocab[token]
+    return self.decode_token(bigram[0]) + self.decode_token(bigram[1])
+
+  def decode(self, tokens):
+    decoded = []
+    for token in tokens:
+      decoded += self.decode_token(token)
+    return bytes(decoded).decode("utf-8", errors="replace")
+
+  def train(self, text, debug_step=100):
+    tokens = list(map(int, text.encode("utf-8")))
+    token_id = self.initial_tokens
+    debug("max_merges", self.max_merges)
+    while(len(self.merges) < self.max_merges):
+      current_counts = self.counts(tokens)
+      bigram = max(current_counts, key=current_counts.get)
+      self.merge(tokens, bigram, token_id)
+      self.merges[bigram] = token_id
+      self.vocab[token_id] = bigram
+      token_id += 1
+      if len(self.merges) % debug_step == 0:
+        debug("train", f"{len(self.merges)}/{self.max_merges}")
+
+  def fit(self, x):
+    return self.train(x)
+
+  def counts(self, tokens):
+    result = {}
+    for bigram in zip(tokens, tokens[1:]):
+      result.setdefault(bigram, 0)
+      result[bigram] += 1
+    return result
+
+  def merge(self, tokens, bigram, token_id):
+    i = 0
+    while i < len(tokens) - 1:
+      if tokens[i] == bigram[0] and tokens[i + 1] == bigram[1]:
+        tokens[i] = token_id
+        del tokens[i + 1]
+      i += 1
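To make the BPETokenizer training loop concrete, a tiny round-trip sketch. The sample text and max_size are arbitrary, chosen small enough that train() always finds a bigram to merge (merging stops being possible once the token stream collapses).

from sciveo.ml.nlp.tokenizers.bpe import BPETokenizer

# 256 base byte ids plus (max_size - 256) = 2 learned merges.
tok = BPETokenizer(max_size=258)
tok.train("the theme of the thesis")  # learns the byte pair for "th" -> 256, then (256, "e") -> 257

encoded = tok.encode("the")           # [116, 104, 101] -> [256, 101] -> [257]
print(encoded, tok.decode(encoded))   # decode() expands the merged ids back to "the"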
sciveo-0.1.40/sciveo/tools/aws/__init__.py (new empty file, no content)
sciveo-0.1.40/sciveo/version.py (new file)
@@ -0,0 +1,2 @@
+
+__version__ = '0.1.40'
{sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/PKG-INFO
@@ -1,12 +1,13 @@
 Metadata-Version: 2.1
 Name: sciveo
-Version: 0.1.39
+Version: 0.1.40
 Description-Content-Type: text/markdown
 Provides-Extra: mon
 Provides-Extra: net
 Provides-Extra: media
 Provides-Extra: media-server
 Provides-Extra: media-ml
+Provides-Extra: media-ml-all
 Provides-Extra: all
 Provides-Extra: media-all
 
{sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/SOURCES.txt
@@ -100,6 +100,8 @@ sciveo/ml/images/tools.py
 sciveo/ml/images/transformers.py
 sciveo/ml/nlp/__init__.py
 sciveo/ml/nlp/embeddings.py
+sciveo/ml/nlp/tokenizers/__init__.py
+sciveo/ml/nlp/tokenizers/bpe.py
 sciveo/ml/video/__init__.py
 sciveo/ml/video/description.py
 sciveo/monitoring/__init__.py
@@ -136,4 +138,5 @@ test/test_configuration.py
 test/test_crypto.py
 test/test_monitoring.py
 test/test_runner.py
-test/test_sampling.py
+test/test_sampling.py
+test/test_tokenizers.py
{sciveo-0.1.39 → sciveo-0.1.40}/sciveo.egg-info/requires.txt
@@ -69,6 +69,10 @@ accelerate>=0.0.0
 annoy>=0.0.0
 ultralytics>=0.0.0
 
+[media-ml-all]
+pyvips>=0.0.0
+einops>=0.0.0
+
 [media-server]
 fastapi
 uvicorn[standard]
{sciveo-0.1.39 → sciveo-0.1.40}/setup.py
@@ -30,7 +30,11 @@ extras_require = {
     'tensorflow>=0.0.0', 'keras>=0.0.0',
     'torch>=0.0.0', 'torchvision>=0.0.0',
     'diffusers>=0.0.0', 'transformers>=0.0.0', 'accelerate>=0.0.0', 'annoy>=0.0.0',
-    'ultralytics>=0.0.0'
+    'ultralytics>=0.0.0',
+  ],
+  'media-ml-all': [
+    'pyvips>=0.0.0',
+    'einops>=0.0.0'
   ]
 }
 
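The new extras group can be installed with pip install "sciveo[media-ml-all]". Below is a small, hedged sanity check that the two optional packages it declares are importable; linking this extra to the new CAPTION/image-query model path is my assumption, the diff only declares the extra.

import importlib.util

# pyvips and einops are the packages listed under the media-ml-all extra in setup.py / requires.txt.
missing = [m for m in ("pyvips", "einops") if importlib.util.find_spec(m) is None]
if missing:
  raise RuntimeError(f"missing optional deps {missing}; try: pip install 'sciveo[media-ml-all]'")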
sciveo-0.1.40/test/test_tokenizers.py (new file)
@@ -0,0 +1,34 @@
+#
+# Pavlin Georgiev, Softel Labs
+#
+# This is a proprietary file and may not be copied,
+# distributed, or modified without express permission
+# from the owner. For licensing inquiries, please
+# contact pavlin@softel.bg.
+#
+# 2024
+#
+
+import math
+import unittest
+
+from sciveo.ml.nlp.tokenizers.bpe import *
+
+
+class TestTokenizers(unittest.TestCase):
+  def test_BPE(self):
+    text = "節樂,《漢語大詞典》一則:「《史記.樂書》:凡作樂者,所以節槳。張守義正義:音洛,言不樂至荒淫也, 網站有中、英文版本,也有繁、簡體版,可通過每頁左上角的連結隨時調整 Unicode! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?)"
+    text += "Using a row in the above table to encode a code point less than 'First code point' (thus using more bytes than necessary) is termed an overlong encoding. These are a security problem because they allow the same code point to be encoded in multiple ways. Overlong encodings (of ../ for example) have been used to bypass security validations in high-profile products including Microsoft's IIS web server[14] and Apache's Tomcat servlet container.[15] Overlong encodings should therefore be considered an error and never decoded. Modified UTF-8 allows an overlong encoding of U+0000."
+
+    T = BPETokenizer(max_size=512)
+    T.train(text)
+
+    t = "你好世界,美好的一天"
+    self.assertTrue(T.decode(T.encode(t)) == t)
+
+    t = "hello world and testing"
+    self.assertTrue(T.decode(T.encode(t)) == t)
+
+
+if __name__ == '__main__':
+  unittest.main()
sciveo-0.1.39/sciveo/version.py (removed)
@@ -1,2 +0,0 @@
-
-__version__ = '0.1.39'
All other files listed above are unchanged between 0.1.39 and 0.1.40.