indovse 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
indovse-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: indovse
3
+ Version: 0.1.0
4
+ Summary: Indonesian Video Salient Entity Detection
5
+ Author: galihkjaya
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: openai-whisper
9
+ Requires-Dist: transformers>=4.40
10
+ Requires-Dist: torch>=2.0
11
+ Requires-Dist: scikit-learn>=1.4
12
+ Requires-Dist: joblib
13
+ Requires-Dist: huggingface_hub
14
+ Requires-Dist: numpy
15
+ Requires-Dist: pandas
16
+ Requires-Dist: ffmpeg-python
17
+ Dynamic: author
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: requires-dist
21
+ Dynamic: requires-python
22
+ Dynamic: summary
23
+
24
+ # IndoVSE
25
+
26
+ IndoVSE is a Python package for extracting salient named entities from Indonesian videos.
27
+
28
+ ## Requirements
29
+
30
+ You must have `ffmpeg` installed on your system (not via pip).
31
+
32
+ - **Ubuntu/Debian:** `sudo apt install ffmpeg`
33
+ - **macOS:** `brew install ffmpeg`
34
+ - **Windows:** Download it from the official FFmpeg website and add it to your system PATH.
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install indovse
40
+ ```
41
+ *(or install from source: run `pip install -e .` inside this folder)*
42
+
43
+ ## Usage
44
+
45
+ ```python
46
+ from indovse import predict
47
+
48
+ # Models are automatically downloaded on first call and cached in memory
49
+ # No GPU required, runs on CPU
50
+ result = predict("video.mp4", top_k=5)
51
+
52
+ # Output is a dict with 'salient_entities' and 'entity_timeline'
53
+ print(result["salient_entities"])
54
+ ```
@@ -0,0 +1,31 @@
1
+ # IndoVSE
2
+
3
+ IndoVSE is a Python package for extracting salient named entities from Indonesian videos.
4
+
5
+ ## Requirements
6
+
7
+ You must have `ffmpeg` installed on your system (not via pip).
8
+
9
+ - **Ubuntu/Debian:** `sudo apt install ffmpeg`
10
+ - **macOS:** `brew install ffmpeg`
11
+ - **Windows:** Download it from the official FFmpeg website and add it to your system PATH.
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ pip install indovse
17
+ ```
18
+ *(or install from source: run `pip install -e .` inside this folder)*
19
+
20
+ ## Usage
21
+
22
+ ```python
23
+ from indovse import predict
24
+
25
+ # Models are automatically downloaded on first call and cached in memory
26
+ # No GPU required, runs on CPU
27
+ result = predict("video.mp4", top_k=5)
28
+
29
+ # Output is a dict with 'salient_entities' and 'entity_timeline'
30
+ print(result["salient_entities"])
31
+ ```
@@ -0,0 +1,3 @@
1
+ from .pipeline import predict
2
+
3
+ __all__ = ["predict"]
@@ -0,0 +1,164 @@
1
+ import os
2
+ import tempfile
3
+ import numpy as np
4
+ import torch
5
+ import joblib
6
+ import ffmpeg
7
+ import whisper
8
+ from transformers import pipeline as hf_pipeline, AutoModel, AutoTokenizer
9
+ from huggingface_hub import hf_hub_download
10
+ from collections import Counter
11
+ from .utils import calculate_temporal_features
12
+
13
+ # Global cache for lazy loading
14
+ _models = {
15
+ "whisper": None,
16
+ "ner": None,
17
+ "indobert_model": None,
18
+ "indobert_tokenizer": None,
19
+ "scaler": None,
20
+ "classifier": None
21
+ }
22
+
23
def _load_models():
    """Fill every still-empty slot of the module-level ``_models`` cache.

    Slots are visited in a fixed order (whisper, ner, indobert_model,
    indobert_tokenizer, scaler, classifier).  A slot that already holds a
    value is left untouched, so repeated calls only load what is missing.
    """

    def _load_pickle(filename):
        # NOTE(review): joblib.load unpickles the downloaded file, which can
        # execute arbitrary code; this trusts the "galihkjaya/IndoVSE" repo.
        return joblib.load(hf_hub_download("galihkjaya/IndoVSE", filename))

    # (slot, zero-argument builder) pairs; builders run only when needed.
    builders = (
        ("whisper", lambda: whisper.load_model("medium")),
        (
            "ner",
            lambda: hf_pipeline(
                "ner",
                model="cahya/bert-base-indonesian-NER",
                aggregation_strategy="simple",
            ),
        ),
        (
            "indobert_model",
            lambda: AutoModel.from_pretrained("indobenchmark/indobert-base-p1"),
        ),
        (
            "indobert_tokenizer",
            lambda: AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1"),
        ),
        ("scaler", lambda: _load_pickle("scaler.pkl")),
        ("classifier", lambda: _load_pickle("model.pkl")),
    )

    for slot, build in builders:
        if _models[slot] is None:
            _models[slot] = build()
38
+
39
def extract_audio(video_path):
    """Extract the audio track of a video into a temporary WAV file.

    The audio is written as 16-bit PCM (``pcm_s16le``), mono, with ffmpeg's
    ``ar='16k'`` sample rate.

    Args:
        video_path: Path of the input media file handed to ffmpeg.

    Returns:
        Path of the newly created ``.wav`` file.  The caller owns the file
        and is responsible for deleting it.

    Raises:
        RuntimeError: If ffmpeg exits with an error; the message carries
            ffmpeg's stderr output.
    """
    # Only the path is needed: close the handle right away so it is not
    # leaked and so ffmpeg can overwrite the file on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        tmp_audio = tmp_file.name

    succeeded = False
    try:
        (
            ffmpeg
            .input(video_path)
            # 'error' (rather than 'quiet') keeps normal runs silent while
            # still putting the failure reason into the captured stderr.
            .output(tmp_audio, acodec='pcm_s16le', ac=1, ar='16k', loglevel='error')
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
        succeeded = True
        return tmp_audio
    except ffmpeg.Error as e:
        detail = e.stderr.decode(errors="replace") if e.stderr else ""
        raise RuntimeError(f"FFmpeg error: {detail}") from e
    finally:
        # Clean up on *any* failure (e.g. the ffmpeg binary is missing),
        # not only on ffmpeg.Error.
        if not succeeded and os.path.exists(tmp_audio):
            os.remove(tmp_audio)
54
+
55
def predict(video_path, top_k=5):
    """Rank the named entities mentioned in a video by predicted salience.

    The audio is extracted with ``extract_audio``, transcribed by the cached
    Whisper model (language ``"id"``), and each transcript segment is run
    through the cached NER pipeline.  Every distinct entity text gets four
    temporal features plus the mean of the segment CLS embeddings it
    appeared in; the cached scaler and classifier turn that vector into a
    probability.

    Args:
        video_path: Path of the video, forwarded to ``extract_audio``.
        top_k: Upper bound on the number of returned entities (applied as a
            slice on the ranked list).

    Returns:
        A dict with two keys:
        ``"salient_entities"`` - list of dicts (``entity_text``,
        ``entity_label``, ``freq_norm``, ``first_appear``, ``coverage``,
        ``burstiness``, ``salient_prob``) whose probability exceeds 0.1,
        sorted by descending probability and cut to ``top_k``;
        ``"entity_timeline"`` - maps every detected entity text to the
        sorted start times of the segments it appeared in.
        Both are empty when nothing was transcribed or no entity was found.
    """
    _load_models()

    # 1. Extract audio
    wav_path = extract_audio(video_path)

    try:
        # 2. Transcribe
        transcript = _models["whisper"].transcribe(
            wav_path, language="id", word_timestamps=True
        )
        segments = transcript.get("segments", [])
        if not segments:
            return {"salient_entities": [], "entity_timeline": {}}

        # The end of the last segment stands in for the total duration.
        duration = segments[-1]["end"]
        if duration == 0.0:
            return {"salient_entities": [], "entity_timeline": {}}

        # 3. NER and per-entity evidence collection
        tag_entities = _models["ner"]
        tokenizer = _models["indobert_tokenizer"]
        encoder = _models["indobert_model"]

        mentions = {}
        mention_count = 0

        for seg in segments:
            text = seg["text"]
            start = seg["start"]

            found = tag_entities(text)
            if not found:
                continue

            # One CLS embedding per segment, shared by all its entities.
            encoded = tokenizer(
                text, return_tensors="pt", truncation=True, max_length=512
            )
            with torch.no_grad():
                hidden = encoder(**encoded).last_hidden_state
            cls_vector = hidden[0, 0, :].numpy()

            for hit in found:
                name = hit["word"].strip()
                if not name:
                    continue

                label = hit["entity_group"]
                record = mentions.setdefault(
                    name, {"labels": [], "timestamps": [], "embeddings": []}
                )
                record["labels"].append(label)
                record["timestamps"].append(start)
                record["embeddings"].append(cls_vector)
                mention_count += 1

        if mention_count == 0:
            return {"salient_entities": [], "entity_timeline": {}}

        rows = []      # public per-entity result dicts
        vectors = []   # classifier inputs, index-aligned with ``rows``
        timeline = {}

        for name, record in mentions.items():
            times = sorted(record["timestamps"])
            timeline[name] = times

            freq_norm, burstiness, first_appear, coverage = (
                calculate_temporal_features(times, duration, mention_count)
            )
            mean_embedding = np.mean(record["embeddings"], axis=0)

            # Temporal features first, then the averaged CLS embedding.
            vectors.append(
                np.concatenate(
                    [[freq_norm, burstiness, first_appear, coverage], mean_embedding]
                )
            )

            rows.append({
                "entity_text": name,
                # Majority vote over the labels seen for this entity text.
                "entity_label": Counter(record["labels"]).most_common(1)[0][0],
                "freq_norm": float(freq_norm),
                "first_appear": float(first_appear),
                "coverage": float(coverage),
                "burstiness": float(burstiness),
            })

        # 4. Predict salience
        features = np.array(vectors)
        scaled = _models["scaler"].transform(features)
        probabilities = _models["classifier"].predict_proba(scaled)[:, 1]

        for row, prob in zip(rows, probabilities):
            row["salient_prob"] = float(prob)

        # 5. Filter and sort
        ranked = sorted(
            (row for row in rows if row["salient_prob"] > 0.1),
            key=lambda row: row["salient_prob"],
            reverse=True,
        )

        return {
            "salient_entities": ranked[:top_k],
            "entity_timeline": timeline,
        }
    finally:
        # The extracted audio is only an intermediate artefact.
        if os.path.exists(wav_path):
            os.remove(wav_path)
@@ -0,0 +1,19 @@
1
+ import numpy as np
2
+
3
def calculate_temporal_features(timestamps, duration, total_mentions):
    """Compute four temporal features for one entity's mention times.

    Args:
        timestamps: Mention times of the entity, in the same unit as
            ``duration``; they need not be sorted.
        duration: Total length used to normalise positions.  Values <= 0
            make the position-based features 0.0.
        total_mentions: Number of mentions across all entities, used to
            normalise the frequency.  Values <= 0 make ``freq_norm`` 0.0.

    Returns:
        A tuple ``(freq_norm, burstiness, first_appear, coverage)``:
        ``freq_norm`` is ``len(timestamps) / total_mentions``;
        ``burstiness`` is ``(sigma - mu) / (sigma + mu)`` over the gaps
        between consecutive mentions (0.0 with fewer than two mentions or
        when ``sigma + mu`` is 0);
        ``first_appear`` is the earliest mention divided by ``duration``;
        ``coverage`` is the span between first and last mention divided by
        ``duration``.
        All four are 0.0 when ``timestamps`` is empty.
    """
    timestamps = sorted(timestamps)
    freq = len(timestamps)

    # No mentions: nothing to index into, every feature is zero.
    if freq == 0:
        return 0.0, 0.0, 0.0, 0.0

    freq_norm = freq / total_mentions if total_mentions > 0 else 0.0

    if duration > 0:
        first_appear = timestamps[0] / duration
        coverage = (timestamps[-1] - timestamps[0]) / duration
    else:
        first_appear = 0.0
        coverage = 0.0

    burstiness = 0.0
    if freq > 1:
        inter_arrival = np.diff(timestamps)
        mu = np.mean(inter_arrival)
        sigma = np.std(inter_arrival)
        # All mentions at the same instant give mu == sigma == 0.
        if (sigma + mu) > 0:
            burstiness = float((sigma - mu) / (sigma + mu))

    return freq_norm, burstiness, first_appear, coverage
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: indovse
3
+ Version: 0.1.0
4
+ Summary: Indonesian Video Salient Entity Detection
5
+ Author: galihkjaya
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: openai-whisper
9
+ Requires-Dist: transformers>=4.40
10
+ Requires-Dist: torch>=2.0
11
+ Requires-Dist: scikit-learn>=1.4
12
+ Requires-Dist: joblib
13
+ Requires-Dist: huggingface_hub
14
+ Requires-Dist: numpy
15
+ Requires-Dist: pandas
16
+ Requires-Dist: ffmpeg-python
17
+ Dynamic: author
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: requires-dist
21
+ Dynamic: requires-python
22
+ Dynamic: summary
23
+
24
+ # IndoVSE
25
+
26
+ IndoVSE is a Python package for extracting salient named entities from Indonesian videos.
27
+
28
+ ## Requirements
29
+
30
+ You must have `ffmpeg` installed on your system (not via pip).
31
+
32
+ - **Ubuntu/Debian:** `sudo apt install ffmpeg`
33
+ - **macOS:** `brew install ffmpeg`
34
+ - **Windows:** Download it from the official FFmpeg website and add it to your system PATH.
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install indovse
40
+ ```
41
+ *(or install from source: run `pip install -e .` inside this folder)*
42
+
43
+ ## Usage
44
+
45
+ ```python
46
+ from indovse import predict
47
+
48
+ # Models are automatically downloaded on first call and cached in memory
49
+ # No GPU required, runs on CPU
50
+ result = predict("video.mp4", top_k=5)
51
+
52
+ # Output is a dict with 'salient_entities' and 'entity_timeline'
53
+ print(result["salient_entities"])
54
+ ```
@@ -0,0 +1,10 @@
1
+ README.md
2
+ setup.py
3
+ indovse/__init__.py
4
+ indovse/pipeline.py
5
+ indovse/utils.py
6
+ indovse.egg-info/PKG-INFO
7
+ indovse.egg-info/SOURCES.txt
8
+ indovse.egg-info/dependency_links.txt
9
+ indovse.egg-info/requires.txt
10
+ indovse.egg-info/top_level.txt
@@ -0,0 +1,9 @@
1
+ openai-whisper
2
+ transformers>=4.40
3
+ torch>=2.0
4
+ scikit-learn>=1.4
5
+ joblib
6
+ huggingface_hub
7
+ numpy
8
+ pandas
9
+ ffmpeg-python
@@ -0,0 +1 @@
1
+ indovse
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
indovse-0.1.0/setup.py ADDED
@@ -0,0 +1,19 @@
1
"""Packaging script for the indovse distribution."""
from pathlib import Path

from setuptools import setup, find_packages

# Resolve data files relative to this script so the build does not depend on
# the current working directory.
HERE = Path(__file__).resolve().parent

# Used when requirements.txt is not available.  The published sdist does not
# ship that file, so reading it unconditionally breaks builds from the sdist.
# Keep this list in sync with requirements.txt.
DEFAULT_REQUIREMENTS = [
    "openai-whisper",
    "transformers>=4.40",
    "torch>=2.0",
    "scikit-learn>=1.4",
    "joblib",
    "huggingface_hub",
    "numpy",
    "pandas",
    "ffmpeg-python",
]


def _read_requirements():
    """Return the requirement specifiers, preferring requirements.txt."""
    req_file = HERE / "requirements.txt"
    if not req_file.is_file():
        return list(DEFAULT_REQUIREMENTS)
    with req_file.open("r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines and comment lines.
        return [line for line in stripped if line and not line.startswith("#")]


with (HERE / "README.md").open("r", encoding="utf-8") as fh:
    long_description = fh.read()

requirements = _read_requirements()

setup(
    name="indovse",
    version="0.1.0",
    author="galihkjaya",
    description="Indonesian Video Salient Entity Detection",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    install_requires=requirements,
    python_requires=">=3.9",
)