speedy-utils 1.1.46__tar.gz → 1.1.48__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. speedy_utils-1.1.48/.github/skills/dataset-processing-multiprocessing/CONFIG_REFERENCE.md +217 -0
  2. speedy_utils-1.1.48/.github/skills/dataset-processing-multiprocessing/SKILL.md +797 -0
  3. speedy_utils-1.1.48/.github/skills/dataset-processing-multiprocessing/example_tokenize_pack.py +323 -0
  4. speedy_utils-1.1.48/.github/skills/edit-llm-inference-style/SKILL.md +133 -0
  5. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/multi-threading-processing/SKILL.md +38 -0
  6. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/PKG-INFO +1 -1
  7. speedy_utils-1.1.48/docs/SKILL.md +415 -0
  8. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/pyproject.toml +9 -1
  9. speedy_utils-1.1.48/scripts/bug.py +16 -0
  10. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/__init__.py +1 -3
  11. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/chat_format/__init__.py +0 -2
  12. speedy_utils-1.1.48/src/llm_utils/chat_format/display.py +384 -0
  13. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/llm.py +62 -22
  14. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/__init__.py +4 -0
  15. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/multi_worker/__init__.py +4 -0
  16. speedy_utils-1.1.48/src/speedy_utils/multi_worker/_multi_process.py +425 -0
  17. speedy_utils-1.1.48/src/speedy_utils/multi_worker/_multi_process_ray.py +308 -0
  18. speedy_utils-1.1.48/src/speedy_utils/multi_worker/common.py +879 -0
  19. speedy_utils-1.1.48/src/speedy_utils/multi_worker/dataset_sharding.py +203 -0
  20. speedy_utils-1.1.48/src/speedy_utils/multi_worker/process.py +128 -0
  21. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/multi_worker/progress.py +71 -1
  22. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/multi_worker/thread.py +45 -0
  23. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/scripts/mpython.py +19 -12
  24. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/llm_utils/test_llm_mixins.py +13 -2
  25. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_memoize_typing.py +9 -2
  26. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_process.py +19 -15
  27. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_process_update.py +5 -8
  28. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_pytorch_sharing.py +9 -0
  29. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_shared_kwargs.py +11 -0
  30. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_thread.py +2 -3
  31. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_tokenization.py +8 -1
  32. speedy_utils-1.1.46/scripts/bug.py +0 -12
  33. speedy_utils-1.1.46/src/llm_utils/chat_format/display.py +0 -465
  34. speedy_utils-1.1.46/src/speedy_utils/multi_worker/process.py +0 -1309
  35. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.githooks/pre-push +0 -0
  36. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/copilot-instructions.md +0 -0
  37. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/prompts/improveParallelErrorHandling.prompt.md +0 -0
  38. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/caching-utilities/SKILL.md +0 -0
  39. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/caching-utilities/examples/caching_example.py +0 -0
  40. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/io-utilities/SKILL.md +0 -0
  41. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/io-utilities/examples/io_example.py +0 -0
  42. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/llm-integration/SKILL.md +0 -0
  43. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/llm-integration/examples/llm_example.py +0 -0
  44. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/ray-distributed-computing/SKILL.md +0 -0
  45. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/skill-creation/SKILL.md +0 -0
  46. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/vision-utilities/SKILL.md +0 -0
  47. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/skills/vision-utilities/examples/vision_example.py +0 -0
  48. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.github/workflows/publish.yml +0 -0
  49. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.gitignore +0 -0
  50. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/.pre-commit-config.yaml +0 -0
  51. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/AGENTS.md +0 -0
  52. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/README.md +0 -0
  53. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/bumpversion.sh +0 -0
  54. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/debug/debug_generate_response.py +0 -0
  55. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/debug/debug_n_param.py +0 -0
  56. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/debug/debug_n_structure.py +0 -0
  57. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/debug/integration_test.py +0 -0
  58. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/debug/test_decode_api.py +0 -0
  59. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/debug/test_endpoints.py +0 -0
  60. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/debug/test_generate.py +0 -0
  61. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/debug/test_generate_endpoint.py +0 -0
  62. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/docs/GENERATE_QUICKREF.md +0 -0
  63. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/docs/IMPLEMENTATION.md +0 -0
  64. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/docs/QUICKSTART.md +0 -0
  65. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/docs/TOKENIZATION.md +0 -0
  66. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/docs/TOKENIZATION_IMPLEMENTATION.md +0 -0
  67. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/docs/zero_copy_sharing.md +0 -0
  68. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/generate_example.py +0 -0
  69. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/llm_ray_example.py +0 -0
  70. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/pytorch_large_model.py +0 -0
  71. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/shared_kwargs_example.py +0 -0
  72. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/temperature_range_example.py +0 -0
  73. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/test_parallel_gpu.py +0 -0
  74. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/test_share_ray.py +0 -0
  75. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/tokenization_example.py +0 -0
  76. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/examples/vision_utils_example.py +0 -0
  77. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/experiments/exp1/dockerfile +0 -0
  78. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/experiments/exp1/run_in_docker.sh +0 -0
  79. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/experiments/exp1/test.png +0 -0
  80. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/experiments/test_read_image.py +0 -0
  81. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/notebooks/README.ipynb +0 -0
  82. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/notebooks/llm_utils/llm_as_a_judge.ipynb +0 -0
  83. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/notebooks/parallel_gpu_pool.ipynb +0 -0
  84. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/notebooks/ray_tutorial.ipynb +0 -0
  85. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/notebooks/test_multi_thread.ipynb +0 -0
  86. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/ruff.toml +0 -0
  87. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/bug_simple.py +0 -0
  88. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/debug_import_time.py +0 -0
  89. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/deploy.sh +0 -0
  90. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/imports.sh +0 -0
  91. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/test.py +0 -0
  92. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/test_both_backends.py +0 -0
  93. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/test_error_handling.py +0 -0
  94. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/test_import_time_vision.py +0 -0
  95. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/test_locals.py +0 -0
  96. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/test_ray_locals.py +0 -0
  97. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/scripts/test_ray_mp.py +0 -0
  98. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/setup.cfg +0 -0
  99. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/datasets_utils/convert_to_arrow.py +0 -0
  100. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/chat_format/transform.py +0 -0
  101. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/chat_format/utils.py +0 -0
  102. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/group_messages.py +0 -0
  103. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/llm_ray.py +0 -0
  104. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/__init__.py +0 -0
  105. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/async_lm/__init__.py +0 -0
  106. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/async_lm/_utils.py +0 -0
  107. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/async_lm/async_llm_task.py +0 -0
  108. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/async_lm/async_lm.py +0 -0
  109. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/async_lm/async_lm_base.py +0 -0
  110. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/async_lm/lm_specific.py +0 -0
  111. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/base_prompt_builder.py +0 -0
  112. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/llm_signature.py +0 -0
  113. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/lm_base.py +0 -0
  114. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/mixins.py +0 -0
  115. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/openai_memoize.py +0 -0
  116. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/signature.py +0 -0
  117. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/lm/utils.py +0 -0
  118. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/scripts/README.md +0 -0
  119. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/scripts/fast_vllm.py +0 -0
  120. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/scripts/vllm_load_balancer.py +0 -0
  121. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/scripts/vllm_serve.py +0 -0
  122. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/vector_cache/__init__.py +0 -0
  123. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/vector_cache/cli.py +0 -0
  124. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/vector_cache/core.py +0 -0
  125. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/vector_cache/types.py +0 -0
  126. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/llm_utils/vector_cache/utils.py +0 -0
  127. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/__imports.py +0 -0
  128. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/__init__.py +0 -0
  129. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/clock.py +0 -0
  130. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/function_decorator.py +0 -0
  131. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/logger.py +0 -0
  132. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/notebook_utils.py +0 -0
  133. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/patcher.py +0 -0
  134. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/report_manager.py +0 -0
  135. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/utils_cache.py +0 -0
  136. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/utils_error.py +0 -0
  137. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/utils_io.py +0 -0
  138. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/utils_misc.py +0 -0
  139. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/common/utils_print.py +0 -0
  140. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/multi_worker/dataset_ray.py +0 -0
  141. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/multi_worker/parallel_gpu_pool.py +0 -0
  142. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/scripts/__init__.py +0 -0
  143. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/scripts/kill_mpython.py +0 -0
  144. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
  145. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/vision_utils/README.md +0 -0
  146. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/vision_utils/__init__.py +0 -0
  147. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/vision_utils/io_utils.py +0 -0
  148. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/src/vision_utils/plot.py +0 -0
  149. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/test.py +0 -0
  150. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/import_all.py +0 -0
  151. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/import_time_report.py +0 -0
  152. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/integration_test.py +0 -0
  153. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/sample_objects.py +0 -0
  154. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test.py +0 -0
  155. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_logger.py +0 -0
  156. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_logger_format.py +0 -0
  157. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_mpython.py +0 -0
  158. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/tests/test_multithread_error_trace.py +0 -0
  159. {speedy_utils-1.1.46 → speedy_utils-1.1.48}/uv.lock +0 -0
@@ -0,0 +1,217 @@
1
+ # Dataset Processing Configuration Reference
2
+
3
+ This document provides configuration templates for common dataset processing scenarios.
4
+
5
+ ## Configuration 1: Standard Tokenization (GPT2 Tokenizer)
6
+
7
+ ```bash
8
+ python example_tokenize_pack.py \
9
+ --src /path/to/raw/dataset \
10
+ --dst /path/to/tokenized/dataset \
11
+ --tokenizer gpt2 \
12
+ --seq_len 2048 \
13
+ --workers 4 \
14
+ --backend mp
15
+ ```
16
+
17
+ **Use for:** Simple text tokenization for LLM fine-tuning
18
+
19
+ **Key parameters:**
20
+ - `seq_len=2048`: Standard context length
21
+ - `workers=4`: Adjust based on CPU cores
22
+ - `backend=mp`: Multiprocessing for CPU-bound work
23
+
24
+ ---
25
+
26
+ ## Configuration 2: Large-Scale Dataset (Ray Distributed)
27
+
28
+ ```bash
29
+ python example_tokenize_pack.py \
30
+ --src /path/to/huge/dataset \
31
+ --dst /path/to/output/dataset \
32
+ --tokenizer meta-llama/Llama-2-7b-hf \
33
+ --seq_len 4096 \
34
+ --workers 16 \
35
+ --backend ray
36
+ ```
37
+
38
+ **Use for:** 10M+ row datasets across multiple machines with Ray
39
+
40
+ **Key parameters:**
41
+ - `seq_len=4096`: Larger context for modern models
42
+ - `workers=16`: Can exceed local CPU count with Ray
43
+ - `backend=ray`: Distributed computing across cluster
44
+
45
+ ---
46
+
47
+ ## Configuration 3: Debug Mode (Small Test Run)
48
+
49
+ ```bash
50
+ python example_tokenize_pack.py \
51
+ --src /path/to/dataset \
52
+ --dst /path/to/output/test \
53
+ --tokenizer gpt2 \
54
+ --seq_len 2048 \
55
+ --workers 1 \
56
+ --debug
57
+ ```
58
+
59
+ **Use for:** Testing the pipeline before full run
60
+
61
+ **Key parameters:**
62
+ - `--debug`: Process only first 1000 rows
63
+ - `workers=1`: Single worker for easier debugging
64
+ - No output size limit for validation
65
+
66
+ ---
67
+
68
+ ## Configuration 4: Memory-Constrained Environment
69
+
70
+ ```bash
71
+ python example_tokenize_pack.py \
72
+ --src /path/to/dataset \
73
+ --dst /path/to/output \
74
+ --tokenizer gpt2 \
75
+ --seq_len 512 \
76
+ --workers 2 \
77
+ --backend mp
78
+ ```
79
+
80
+ **Use for:** Machines with limited RAM
81
+
82
+ **Key parameters:**
83
+ - `seq_len=512`: Smaller sequences use less memory per worker
84
+ - `workers=2`: Fewer workers = less memory overhead
85
+ - Each worker uses ~1-2GB typically
86
+
87
+ ---
88
+
89
+ ## Configuration 5: Fast Processing (High Throughput)
90
+
91
+ ```bash
92
+ python example_tokenize_pack.py \
93
+ --src /path/to/dataset \
94
+ --dst /path/to/output \
95
+ --tokenizer gpt2 \
96
+ --seq_len 2048 \
97
+ --workers $(nproc) \
98
+ --backend mp
99
+ ```
100
+
101
+ **Use for:** Maximum throughput, CPU cores fully saturated
102
+
103
+ **Key parameters:**
104
+ - `workers=$(nproc)`: Match exact CPU count
105
+ - `backend=mp`: Multiprocessing for CPU optimization
106
+ - Expect 1000-10000 sequences/sec depending on tokenizer
107
+
108
+ ---
109
+
110
+ ## Worker Count Guidelines
111
+
112
+ | Scenario | Workers | Backend |
113
+ |----------|---------|---------|
114
+ | **Laptop (4 cores)** | 2-3 | mp |
115
+ | **Workstation (16 cores)** | 12-15 | mp |
116
+ | **Server (32 cores)** | 28-31 | mp |
117
+ | **Large cluster** | 100+ | ray |
118
+
119
+ **Rule of thumb:** `workers = cpu_count - 1` for good performance
120
+
121
+ ---
122
+
123
+ ## Sequence Length Recommendations
124
+
125
+ | Use Case | Seq Length | Memory/Worker |
126
+ |----------|------------|---------------|
127
+ | **Short text (summaries)** | 512 | ~400MB |
128
+ | **Medium text (articles)** | 2048 | ~800MB |
129
+ | **Long context (code)** | 4096 | ~1.5GB |
130
+ | **Very long (books)** | 8192 | ~3GB |
131
+
132
+ ---
133
+
134
+ ## Performance Tuning
135
+
136
+ ### If you run out of memory:
137
+ 1. Reduce `seq_len` by 50%
138
+ 2. Reduce `workers` by 50%
139
+ 3. Add swap space (fallback, slower)
140
+
141
+ ### If processing is too slow:
142
+ 1. Increase `workers` (up to CPU count)
143
+ 2. Check whether I/O is the bottleneck (use `--debug` for a quick check)
144
+ 3. Use `--backend ray` for distributed processing
145
+
146
+ ### If some shards fail:
147
+ 1. Check temp directory for partial files
148
+ 2. Reduce `workers` (less memory pressure)
149
+ 3. Inspect the error logs (error logging is already implemented)
150
+
151
+ ---
152
+
153
+ ## Platform-Specific Notes
154
+
155
+ ### Linux/Mac
156
+ ```bash
157
+ # Full parallelism (macOS has no `nproc` by default; use $(sysctl -n hw.ncpu) there)
158
+ --workers $(nproc) --backend mp
159
+
160
+ # With Ray cluster
161
+ --workers 64 --backend ray
162
+ ```
163
+
164
+ ### Windows
165
+ ```bash
166
+ # Use fewer workers (Windows uses the "spawn" start method, so each worker process is costlier to start)
167
+ --workers 2-4 --backend mp
168
+
169
+ # Consider WSL2 + Linux for better parallelism
170
+ ```
171
+
172
+ ### Docker/Container
173
+ ```bash
174
+ # Respect container limits
175
+ --workers 2-4 # Even if host has more cores
176
+
177
+ # Set memory limits safely
178
+ --seq_len 1024 # Conservative
179
+ ```
180
+
181
+ ---
182
+
183
+ ## Monitoring Metrics
184
+
185
+ Track these during processing:
186
+
187
+ ```
188
+ Time per shard: <time_per_shard>
189
+ Tokens/second: len(output) * seq_len / total_time
190
+ Memory/worker: Watch via `top` or Docker stats
191
+ Failed shards: Count of None results
192
+ ```
193
+
194
+ **Healthy run:**
195
+ - ✅ Tokens/sec > 1000
196
+ - ✅ Memory < 80% of available
197
+ - ✅ Failed shards = 0
198
+ - ✅ Total time = dataset_size / throughput
199
+
200
+ **Concerning run:**
201
+ - ⚠️ Tokens/sec < 100
202
+ - ⚠️ Memory > 90%
203
+ - ⚠️ Failed shards > 0
204
+ - ⚠️ Very uneven shard times
205
+
206
+ ---
207
+
208
+ ## Common Errors & Fixes
209
+
210
+ | Error | Cause | Fix |
211
+ |-------|-------|-----|
212
+ | `OOM Killed` | `seq_len` or `workers` set too high | Reduce both by 50% |
213
+ | `Pickle error` | Large object in args | Pass paths, not objects |
214
+ | `Timeout` | Shard too large | Increase workers or reduce seq_len |
215
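+ | `Permission denied` | Temp directory is not writable or holds stale files owned by another user | Fix the temp directory's ownership/permissions, or remove the stale files (`sudo rm -rf ...` only as a last resort); rerun with `--debug` to verify |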
216
+ | `Empty result` | All examples filtered | Check transform logic |
217
+