doctr-synth-generator 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/PKG-INFO +2 -2
  2. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/README.md +1 -1
  3. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/doctr_synth_generator.egg-info/PKG-INFO +2 -2
  4. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/corpus_downloader.py +1 -1
  5. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/doctr_dataset.py +26 -4
  6. doctr_synth_generator-0.2.1/generator/version.py +1 -0
  7. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/setup.py +1 -1
  8. doctr_synth_generator-0.2.0/generator/version.py +0 -1
  9. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/LICENSE +0 -0
  10. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/doctr_synth_generator.egg-info/SOURCES.txt +0 -0
  11. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/doctr_synth_generator.egg-info/dependency_links.txt +0 -0
  12. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/doctr_synth_generator.egg-info/requires.txt +0 -0
  13. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/doctr_synth_generator.egg-info/top_level.txt +0 -0
  14. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/doctr_synth_generator.egg-info/zip-safe +0 -0
  15. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/__init__.py +0 -0
  16. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/augmentations/__init__.py +0 -0
  17. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/augmentations/augmentation_pipeline.py +0 -0
  18. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/augmentations/random_blur.py +0 -0
  19. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/augmentations/random_gaussian_noise.py +0 -0
  20. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/augmentations/random_jpeg_compression.py +0 -0
  21. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/augmentations/random_perspective.py +0 -0
  22. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/augmentations/random_pixel_dropout.py +0 -0
  23. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/augmentations/random_rotate.py +0 -0
  24. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/__init__.py +0 -0
  25. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/background_downloader.py +0 -0
  26. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/background_manager.py +0 -0
  27. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/config.py +0 -0
  28. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/dataset_balancer.py +0 -0
  29. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/dataset_splitter.py +0 -0
  30. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/font_downloader.py +0 -0
  31. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/font_selector.py +0 -0
  32. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/generator.py +0 -0
  33. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/page_generator.py +0 -0
  34. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/text_renderer.py +0 -0
  35. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/text_styling.py +0 -0
  36. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/vocab_coverage.py +0 -0
  37. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/components/vocabs.py +0 -0
  38. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/generator/dataset_generator.py +0 -0
  39. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/pyproject.toml +0 -0
  40. {doctr_synth_generator-0.2.0 → doctr_synth_generator-0.2.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doctr-synth-generator
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A synthetic data generator for training OCR models
5
5
  Author-email: Felix Dittrich <felixdittrich92@gmail.com>
6
6
  Maintainer: Felix Dittrich
@@ -250,7 +250,7 @@ Dynamic: license-file
250
250
  ![Build Status](https://github.com/felixdittrich92/docTR-Synth-Generator/workflows/builds/badge.svg)
251
251
  [![codecov](https://codecov.io/gh/felixdittrich92/docTR-Synth-Generator/graph/badge.svg?token=31MDR20JGI)](https://codecov.io/gh/felixdittrich92/docTR-Synth-Generator)
252
252
  [![CodeFactor](https://www.codefactor.io/repository/github/felixdittrich92/doctr-synth-generator/badge)](https://www.codefactor.io/repository/github/felixdittrich92/doctr-synth-generator)
253
- [![Pypi](https://img.shields.io/badge/pypi-v0.1.0-blue.svg)](https://pypi.org/project/docTR-Synth-Generator/)
253
+ [![Pypi](https://img.shields.io/badge/pypi-v0.2.1-blue.svg)](https://pypi.org/project/docTR-Synth-Generator/)
254
254
 
255
255
  # docTR-Synth-Generator
256
256
  A tool to generate synthetic OCR datasets - made for docTR
@@ -2,7 +2,7 @@
2
2
  ![Build Status](https://github.com/felixdittrich92/docTR-Synth-Generator/workflows/builds/badge.svg)
3
3
  [![codecov](https://codecov.io/gh/felixdittrich92/docTR-Synth-Generator/graph/badge.svg?token=31MDR20JGI)](https://codecov.io/gh/felixdittrich92/docTR-Synth-Generator)
4
4
  [![CodeFactor](https://www.codefactor.io/repository/github/felixdittrich92/doctr-synth-generator/badge)](https://www.codefactor.io/repository/github/felixdittrich92/doctr-synth-generator)
5
- [![Pypi](https://img.shields.io/badge/pypi-v0.1.0-blue.svg)](https://pypi.org/project/docTR-Synth-Generator/)
5
+ [![Pypi](https://img.shields.io/badge/pypi-v0.2.1-blue.svg)](https://pypi.org/project/docTR-Synth-Generator/)
6
6
 
7
7
  # docTR-Synth-Generator
8
8
  A tool to generate synthetic OCR datasets - made for docTR
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doctr-synth-generator
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A synthetic data generator for training OCR models
5
5
  Author-email: Felix Dittrich <felixdittrich92@gmail.com>
6
6
  Maintainer: Felix Dittrich
@@ -250,7 +250,7 @@ Dynamic: license-file
250
250
  ![Build Status](https://github.com/felixdittrich92/docTR-Synth-Generator/workflows/builds/badge.svg)
251
251
  [![codecov](https://codecov.io/gh/felixdittrich92/docTR-Synth-Generator/graph/badge.svg?token=31MDR20JGI)](https://codecov.io/gh/felixdittrich92/docTR-Synth-Generator)
252
252
  [![CodeFactor](https://www.codefactor.io/repository/github/felixdittrich92/doctr-synth-generator/badge)](https://www.codefactor.io/repository/github/felixdittrich92/doctr-synth-generator)
253
- [![Pypi](https://img.shields.io/badge/pypi-v0.1.0-blue.svg)](https://pypi.org/project/docTR-Synth-Generator/)
253
+ [![Pypi](https://img.shields.io/badge/pypi-v0.2.1-blue.svg)](https://pypi.org/project/docTR-Synth-Generator/)
254
254
 
255
255
  # docTR-Synth-Generator
256
256
  A tool to generate synthetic OCR datasets - made for docTR
@@ -263,5 +263,5 @@ def generate_numeric_tokens(n: int, seed: int | None = None) -> list[str]:
263
263
  elif kind == "percent":
264
264
  tokens.append(f"{rng.randint(0, 100)}%")
265
265
  else: # phone
266
- tokens.append(f"+{rng.randint(1, 99)} {rng.randint(100, 999)} {rng.randint(100000, 9999999)}")
266
+ tokens.append(f"+{rng.randint(1, 99)}{rng.randint(100, 999)}{rng.randint(100000, 9999999)}")
267
267
  return tokens
@@ -20,11 +20,31 @@ from .dataset_generator import SyntheticDatasetGenerator
20
20
 
21
21
  try: # docTR's single-class name ("words"); fall back to the same literal if absent.
22
22
  from doctr.file_utils import CLASS_NAME # type: ignore[import-not-found]
23
+ from doctr.utils import Sample # type: ignore[import-not-found]
23
24
  except Exception: # pragma: no cover - docTR not installed
25
+ from dataclasses import dataclass
26
+
24
27
  CLASS_NAME = "words"
25
28
 
29
+ @dataclass
30
+ class Sample: # type: ignore[no-redef]
31
+ """Canonical data container for all transforms."""
32
+
33
+ image: Any
34
+ mask: Any | None = None
35
+ target: np.ndarray | dict[str, np.ndarray] | None = None
36
+
37
+ def replace(self, **kwargs) -> "Sample":
38
+ return Sample(
39
+ image=kwargs.get("image", self.image),
40
+ mask=kwargs.get("mask", self.mask),
41
+ target=kwargs.get("target", self.target),
42
+ )
43
+
44
+
26
45
  __all__ = [
27
46
  "CLASS_NAME",
47
+ "Sample",
28
48
  "polygons_to_target",
29
49
  "render_recognition_sample",
30
50
  "render_detection_sample",
@@ -118,7 +138,7 @@ def _torch():
118
138
  def _pil_to_tensor(img: Image.Image):
119
139
  """PIL RGB -> ``CxHxW`` float32 tensor in ``[0, 1]`` (matches docTR's reader)."""
120
140
  torch = _torch()
121
- arr = np.asarray(img.convert("RGB"), dtype=np.uint8)
141
+ arr = np.asarray(img.convert("RGB"), dtype=np.uint8, copy=True)
122
142
  return torch.from_numpy(arr).permute(2, 0, 1).contiguous().float().div_(255.0)
123
143
 
124
144
 
@@ -175,11 +195,13 @@ class _BaseSynthDataset:
175
195
  self._seed_sample(index)
176
196
  img, target = self._render(index)
177
197
  tensor = _pil_to_tensor(img)
198
+ sample = Sample(image=tensor, target=target)
178
199
  if self.img_transforms is not None:
179
- tensor = self.img_transforms(tensor)
200
+ sample = self.img_transforms(sample)
180
201
  if self.sample_transforms is not None:
181
- tensor, target = self.sample_transforms(tensor, target)
182
- return tensor, target
202
+ sample = self.sample_transforms(sample)
203
+ # Keep compatibility with the existing collate_fn
204
+ return sample.image, sample.target
183
205
 
184
206
  @staticmethod
185
207
  def collate_fn(samples):
@@ -0,0 +1 @@
1
+ __version__ = 'v0.2.1'
@@ -9,7 +9,7 @@ from pathlib import Path
9
9
  from setuptools import setup
10
10
 
11
11
  PKG_NAME = "doctr-synth-generator"
12
- VERSION = os.getenv("BUILD_VERSION", "0.2.0a0")
12
+ VERSION = os.getenv("BUILD_VERSION", "0.2.1a0")
13
13
 
14
14
 
15
15
  if __name__ == "__main__":
@@ -1 +0,0 @@
1
- __version__ = 'v0.2.0'