neural-extractor-node 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,134 @@
1
+ # /// script
2
+ # # STRICT LOCK: Capped at 3.12. Python 3.13+ (PEP 667) breaks Whisper's setup.py read_version() via locals() KeyError.
3
+ # requires-python = ">=3.11, <3.13"
4
+ # dependencies = [
5
+ # "openai-whisper==20240930",
6
+ # "torch==2.12.0",
7
+ # # torchaudio is intentionally omitted: Whisper doesn't need it, and torchaudio==2.12.0 was never released.
8
+ # ]
9
+ #
10
+ # [tool.uv.sources]
11
+ # # HARDWARE BRIDGE: Routes the engine downloads to the CUDA 13.0 binaries (Blackwell sm_120 support)
12
+ # torch = { index = "pytorch-cu130" }
13
+ #
14
+ # [[tool.uv.index]]
15
+ # name = "pytorch-cu130"
16
+ # url = "https://download.pytorch.org/whl/cu130"
17
+ # explicit = true
18
+ #
19
+ # [tool.uv.extra-build-dependencies]
20
+ # # STRICT LOCK: OpenAI's setup.py still relies on pkg_resources, requiring legacy setuptools to build
21
+ # openai-whisper = ["setuptools<81"]
22
+ # ///
23
+
24
+ import whisper
25
+ import glob
26
+ import os
27
+ import re
28
+ import torch
29
+
30
+
31
# -------- CONFIG & HARDWARE --------
# Model name handed to whisper.load_model() in main().
MODEL_NAME = "medium"  # "small" = faster, "medium" = more accurate
# Language code handed to model.transcribe() in main().
LANGUAGE = "ru"  # "en"
34
+
35
+ # --- HYBRID PATH DISCOVERY ---
36
def get_data_dir(candidates=("data", ".", "src/neural_extractor_node"),
                 pattern="*part*.wav"):
    """Return the first candidate directory that contains matching audio files.

    Args:
        candidates: Directories to probe, in priority order. The defaults are:
            1. 'data' (Docker volume mount)
            2. '.' (native: running from inside the neural_extractor_node
               folder)
            3. 'src/neural_extractor_node' (native: running from the project
               root)
        pattern: Glob pattern, joined onto each candidate, that identifies
            the input files.

    Returns:
        The first entry of *candidates* in which at least one path matches
        *pattern*, or "." when no candidate has a match.
    """
    for candidate in candidates:
        # iglob is lazy, so the scan stops at the first match instead of
        # building the complete list just to test that it is non-empty.
        if any(glob.iglob(os.path.join(candidate, pattern))):
            return candidate
    return "."  # Fallback if nothing is found
44
+
45
# Directory holding the input audio parts; resolved once, when the module is
# imported, by probing the candidate folders in get_data_dir().
BASE_DIR = get_data_dir()

# Glob for the audio parts to transcribe, and the path of the combined
# transcript written by main(); both live inside BASE_DIR.
PATTERN = os.path.join(BASE_DIR, "*part*.wav")
OUTPUT_FILE = os.path.join(BASE_DIR, "full_transcript.txt")
# -----------------------------------


# Global Hardware Detection
# "cuda" when PyTorch reports an available CUDA device, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
54
+ # -----------------------------------
55
+
56
def extract_part_number(filename):
    """Return the integer that follows ``part`` in the file name of *filename*.

    Only the final path component is searched, so digits in a parent
    directory such as ``part2_raw/`` cannot be mistaken for the part number.

    Args:
        filename: Path or bare file name, e.g. ``"data/talk_part12.wav"``.

    Returns:
        The first ``part<digits>`` number found in the base name, as an
        ``int``, or ``0`` when the base name contains none.
    """
    match = re.search(r'part(\d+)', os.path.basename(filename))
    return int(match.group(1)) if match else 0
59
+
60
def main():
    """Transcribe every matched audio part into one combined transcript file.

    Reads the module-level configuration (``DEVICE``, ``MODEL_NAME``,
    ``LANGUAGE``, ``PATTERN``, ``OUTPUT_FILE``). Files matching ``PATTERN``
    are ordered with ``extract_part_number`` and transcribed one at a time;
    each result is written to ``OUTPUT_FILE`` under a
    ``--- <file name> ---`` header. A failure on one file is printed and
    recorded in the transcript, and processing continues with the next file.

    Returns:
        None. Returns early, before the model is loaded, when no file
        matches ``PATTERN``.
    """
    print(f"\n[SYSTEM] Target Hardware: {DEVICE.upper()}")

    if DEVICE == "cuda":
        gpu_name = torch.cuda.get_device_name(0)
        print(f"[SYSTEM] GPU Detected: {gpu_name}")

        # Let PyTorch use TensorFloat32 for CUDA matrix multiplications
        # (the author targets RTX 30/40 series, i.e. Ampere/Ada GPUs).
        torch.backends.cuda.matmul.allow_tf32 = True

    files = sorted(glob.glob(PATTERN), key=extract_part_number)

    print("\nMatched files:")
    for f in files:
        print(f)

    if not files:
        print("\nNo files found. Check filename pattern.")
        return

    print(f"\nFound {len(files)} files\n")

    # Load the model only once we know there is work to do, so an empty
    # input folder does not cost a model load.
    target_memory = "VRAM" if DEVICE == "cuda" else "RAM"
    print(f"\nLoading '{MODEL_NAME}' model into {target_memory}...")
    # Explicitly map the model onto the selected device.
    model = whisper.load_model(MODEL_NAME, device=DEVICE)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        for i, file in enumerate(files, 1):
            print(f"\n[{i}/{len(files)}] Processing: {file}")

            try:
                # --- INFERENCE EXECUTION ---
                result = model.transcribe(
                    file,
                    language=LANGUAGE,
                    # Half precision only on CUDA; fp16=False on CPU avoids
                    # float16 warnings/crashes there.
                    fp16=(DEVICE == "cuda"),
                    verbose=True,
                )

                # Always build the text from the individual segments.
                text = " ".join(seg["text"].strip() for seg in result["segments"])

                print(f"Finished: {file} | chars: {len(text)}")

                out.write(f"\n--- {os.path.basename(file)} ---\n")
                out.write(text + "\n")
                # Flush per file so finished parts survive a later crash.
                out.flush()

            except Exception as e:
                # Deliberate best effort: record the failure and move on to
                # the next file instead of aborting the whole run.
                print(f"Error processing {file}: {e}")
                out.write(f"\n--- ERROR in {file} ---\n{e}\n")
                out.flush()

    print(f"\nDone. Transcript saved to: {OUTPUT_FILE}")
119
+
120
# Run the transcription pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
122
+
123
+ # --- EXECUTION INSTRUCTIONS ---
124
+
125
+ # Native Bare Metal Ubuntu (Ensure NVIDIA Proprietary Drivers + CUDA Toolkit are installed)
126
+ # sudo apt update && sudo apt install -y ffmpeg
127
+ # curl -LsSf https://astral.sh/uv/install.sh | sh
128
+ # uv run python "$(pwd)/src/neural_extractor_node/transcribe.py"
129
+ # or inside "src/neural_extractor_node"
130
+ # uv run python "transcribe.py"
131
+
132
+ # Docker Execution (If running this inside the container built previously):
133
+ # MUST pass the GPU flag to Docker to expose the hardware bridge:
134
+ # docker run -it --gpus all -v "$(pwd)/src/neural_extractor_node:/app/data" neural-extractor-node-i
@@ -0,0 +1,7 @@
1
+ Metadata-Version: 2.4
2
+ Name: neural-extractor-node
3
+ Version: 0.1.0
4
+ Summary: Deterministic Whisper Pipeline
5
+ Requires-Python: <=3.12,>=3.11
6
+ Requires-Dist: openai-whisper==20240930
7
+ Requires-Dist: torch==2.12.0
@@ -0,0 +1,6 @@
1
+ neural_extractor_node/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
2
+ neural_extractor_node/transcribe.py,sha256=w0Yu5BJd3Hsn-v4EcK2x7T2KHCKiGqDO2een2jZKuHo,4680
3
+ neural_extractor_node-0.1.0.dist-info/METADATA,sha256=G_7DMqrFb9d2Wop7Pqtcfd-bcvEFYp3qZItGYSnd7Io,205
4
+ neural_extractor_node-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
5
+ neural_extractor_node-0.1.0.dist-info/top_level.txt,sha256=xKSWCynWxp-nrCnmmuKcz9QF6vEIMk3sYGNPWe9kKVA,22
6
+ neural_extractor_node-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ neural_extractor_node