indovse 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indovse-0.1.0/PKG-INFO +54 -0
- indovse-0.1.0/README.md +31 -0
- indovse-0.1.0/indovse/__init__.py +3 -0
- indovse-0.1.0/indovse/pipeline.py +164 -0
- indovse-0.1.0/indovse/utils.py +19 -0
- indovse-0.1.0/indovse.egg-info/PKG-INFO +54 -0
- indovse-0.1.0/indovse.egg-info/SOURCES.txt +10 -0
- indovse-0.1.0/indovse.egg-info/dependency_links.txt +1 -0
- indovse-0.1.0/indovse.egg-info/requires.txt +9 -0
- indovse-0.1.0/indovse.egg-info/top_level.txt +1 -0
- indovse-0.1.0/setup.cfg +4 -0
- indovse-0.1.0/setup.py +19 -0
indovse-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: indovse
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Indonesian Video Salient Entity Detection
|
|
5
|
+
Author: galihkjaya
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: openai-whisper
|
|
9
|
+
Requires-Dist: transformers>=4.40
|
|
10
|
+
Requires-Dist: torch>=2.0
|
|
11
|
+
Requires-Dist: scikit-learn>=1.4
|
|
12
|
+
Requires-Dist: joblib
|
|
13
|
+
Requires-Dist: huggingface_hub
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Requires-Dist: ffmpeg-python
|
|
17
|
+
Dynamic: author
|
|
18
|
+
Dynamic: description
|
|
19
|
+
Dynamic: description-content-type
|
|
20
|
+
Dynamic: requires-dist
|
|
21
|
+
Dynamic: requires-python
|
|
22
|
+
Dynamic: summary
|
|
23
|
+
|
|
24
|
+
# IndoVSE
|
|
25
|
+
|
|
26
|
+
IndoVSE is a Python package for extracting salient named entities from Indonesian videos.
|
|
27
|
+
|
|
28
|
+
## Requirements
|
|
29
|
+
|
|
30
|
+
You must have `ffmpeg` installed on your system (not via pip).
|
|
31
|
+
|
|
32
|
+
- **Ubuntu/Debian:** `sudo apt install ffmpeg`
|
|
33
|
+
- **MacOS:** `brew install ffmpeg`
|
|
34
|
+
- **Windows:** Download from official ffmpeg website and add it to your system PATH.
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install indovse
|
|
40
|
+
```
|
|
41
|
+
*(atau instal dari source: `pip install -e .` di dalam folder ini)*
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from indovse import predict
|
|
47
|
+
|
|
48
|
+
# Models are automatically downloaded on first call and cached in memory
|
|
49
|
+
# No GPU required, runs on CPU
|
|
50
|
+
result = predict("video.mp4", top_k=5)
|
|
51
|
+
|
|
52
|
+
# Output is a dict with 'salient_entities' and 'entity_timeline'
|
|
53
|
+
print(result["salient_entities"])
|
|
54
|
+
```
|
indovse-0.1.0/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# IndoVSE
|
|
2
|
+
|
|
3
|
+
IndoVSE is a Python package for extracting salient named entities from Indonesian videos.
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
You must have `ffmpeg` installed on your system (not via pip).
|
|
8
|
+
|
|
9
|
+
- **Ubuntu/Debian:** `sudo apt install ffmpeg`
|
|
10
|
+
- **macOS:** `brew install ffmpeg`
|
|
11
|
+
- **Windows:** Download ffmpeg from the official ffmpeg website and add it to your system PATH.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install indovse
|
|
17
|
+
```
|
|
18
|
+
*(or install from source: run `pip install -e .` inside this folder)*
|
|
19
|
+
|
|
20
|
+
## Usage
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from indovse import predict
|
|
24
|
+
|
|
25
|
+
# Models are automatically downloaded on first call and cached in memory
|
|
26
|
+
# No GPU required, runs on CPU
|
|
27
|
+
result = predict("video.mp4", top_k=5)
|
|
28
|
+
|
|
29
|
+
# Output is a dict with 'salient_entities' and 'entity_timeline'
|
|
30
|
+
print(result["salient_entities"])
|
|
31
|
+
```
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import numpy as np
|
|
4
|
+
import torch
|
|
5
|
+
import joblib
|
|
6
|
+
import ffmpeg
|
|
7
|
+
import whisper
|
|
8
|
+
from transformers import pipeline as hf_pipeline, AutoModel, AutoTokenizer
|
|
9
|
+
from huggingface_hub import hf_hub_download
|
|
10
|
+
from collections import Counter
|
|
11
|
+
from .utils import calculate_temporal_features
|
|
12
|
+
|
|
13
|
+
# Global cache for lazy loading
|
|
14
|
+
_models = {
|
|
15
|
+
"whisper": None,
|
|
16
|
+
"ner": None,
|
|
17
|
+
"indobert_model": None,
|
|
18
|
+
"indobert_tokenizer": None,
|
|
19
|
+
"scaler": None,
|
|
20
|
+
"classifier": None
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
def _load_models():
    """Fill every still-empty slot of the module-level ``_models`` cache.

    Each slot is checked on its own and is only loaded while it is ``None``,
    so calling this function again reuses whatever was loaded before.
    """

    def _load_hub_pickle(filename):
        # NOTE(review): joblib.load unpickles the fetched file, and
        # unpickling can execute arbitrary code. This relies on trusting the
        # "galihkjaya/IndoVSE" repository contents.
        return joblib.load(hf_hub_download("galihkjaya/IndoVSE", filename))

    # (cache slot, zero-argument factory) pairs, processed in this order.
    loaders = (
        ("whisper", lambda: whisper.load_model("medium")),
        (
            "ner",
            lambda: hf_pipeline(
                "ner",
                model="cahya/bert-base-indonesian-NER",
                aggregation_strategy="simple",
            ),
        ),
        (
            "indobert_model",
            lambda: AutoModel.from_pretrained("indobenchmark/indobert-base-p1"),
        ),
        (
            "indobert_tokenizer",
            lambda: AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1"),
        ),
        ("scaler", lambda: _load_hub_pickle("scaler.pkl")),
        ("classifier", lambda: _load_hub_pickle("model.pkl")),
    )

    for slot, build in loaders:
        if _models[slot] is None:
            _models[slot] = build()
|
|
38
|
+
|
|
39
|
+
def extract_audio(video_path):
    """Extract the audio of ``video_path`` into a temporary WAV file.

    ffmpeg is asked for 16-bit PCM (``pcm_s16le``), one channel, at a
    ``16k`` sample rate.

    Args:
        video_path: Input path handed to ffmpeg.

    Returns:
        The path of the newly written temporary ``.wav`` file. The caller
        owns the file and is responsible for deleting it.

    Raises:
        RuntimeError: If ffmpeg reports an error; the message carries
            ffmpeg's captured stderr and the original error is chained.
    """
    # mkstemp hands back an open descriptor; close it right away so ffmpeg
    # can overwrite the file (an open handle blocks that on Windows) and so
    # no descriptor is leaked.
    fd, tmp_audio = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    succeeded = False
    try:
        (
            ffmpeg
            .input(video_path)
            .output(tmp_audio, acodec='pcm_s16le', ac=1, ar='16k', loglevel='quiet')
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
        succeeded = True
        return tmp_audio
    except ffmpeg.Error as e:
        # stderr is bytes; decode leniently so a stray byte cannot mask the
        # real failure with a UnicodeDecodeError.
        detail = e.stderr.decode(errors="replace") if e.stderr else ""
        raise RuntimeError(f"FFmpeg error: {detail}") from e
    finally:
        # Remove the placeholder file on *any* failure, not only on
        # ffmpeg.Error (e.g. when the ffmpeg executable cannot be started).
        if not succeeded and os.path.exists(tmp_audio):
            os.remove(tmp_audio)
|
|
54
|
+
|
|
55
|
+
def _collect_entity_mentions(segments):
    """Run NER on every transcript segment and group mentions by entity text.

    Returns:
        A ``(entities_data, total_mentions)`` tuple. ``entities_data`` maps
        each stripped entity string to its ``"labels"``, ``"timestamps"``
        (segment start times) and ``"embeddings"`` lists, one item per
        mention. ``total_mentions`` counts all recorded mentions.
    """
    ner_model = _models["ner"]
    tokenizer = _models["indobert_tokenizer"]
    bert_model = _models["indobert_model"]

    entities_data = {}
    total_mentions = 0

    for segment in segments:
        seg_text = segment['text']
        seg_start = segment['start']

        ner_res = ner_model(seg_text)
        if not ner_res:
            continue

        # First-token embedding of the whole segment; every entity mentioned
        # in this segment shares it.
        inputs = tokenizer(seg_text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        cls_emb = outputs.last_hidden_state[0, 0, :].numpy()

        for ent in ner_res:
            ent_text = ent['word'].strip()
            if not ent_text:
                continue

            record = entities_data.setdefault(
                ent_text, {"labels": [], "timestamps": [], "embeddings": []}
            )
            record["labels"].append(ent['entity_group'])
            record["timestamps"].append(seg_start)
            record["embeddings"].append(cls_emb)
            total_mentions += 1

    return entities_data, total_mentions


def _build_entity_records(entities_data, duration, total_mentions):
    """Turn grouped mentions into result rows, feature vectors and a timeline.

    Returns:
        A ``(results, feature_vectors, entity_timeline)`` tuple where
        ``results[i]`` describes the entity whose classifier input is
        ``feature_vectors[i]``.
    """
    results = []
    feature_vectors = []
    entity_timeline = {}

    for ent_text, data in entities_data.items():
        timestamps = sorted(data["timestamps"])
        entity_timeline[ent_text] = timestamps

        freq_norm, burstiness, first_appear, coverage = calculate_temporal_features(
            timestamps, duration, total_mentions
        )

        # Four temporal features followed by the mean segment embedding.
        avg_emb = np.mean(data["embeddings"], axis=0)
        feature_vectors.append(
            np.concatenate([[freq_norm, burstiness, first_appear, coverage], avg_emb])
        )

        # The entity label is decided by majority vote over its mentions.
        majority_label = Counter(data["labels"]).most_common(1)[0][0]

        results.append({
            "entity_text": ent_text,
            "entity_label": majority_label,
            "freq_norm": float(freq_norm),
            "first_appear": float(first_appear),
            "coverage": float(coverage),
            "burstiness": float(burstiness),
        })

    return results, feature_vectors, entity_timeline


def predict(video_path, top_k=5, threshold=0.1):
    """Detect the most salient named entities spoken in a video.

    The audio is extracted and transcribed (``language="id"``), entities are
    found per transcript segment, and each distinct entity is scored by the
    cached scaler and classifier.

    Args:
        video_path: Path of the video to analyse.
        top_k: Maximum number of salient entities to return.
        threshold: An entity is kept only if its salience probability is
            strictly greater than this value. Defaults to ``0.1``.

    Returns:
        A dict with two keys. ``"salient_entities"`` is a list of per-entity
        dicts (``entity_text``, ``entity_label``, ``freq_norm``,
        ``first_appear``, ``coverage``, ``burstiness``, ``salient_prob``)
        sorted by descending ``salient_prob`` and cut to ``top_k``.
        ``"entity_timeline"`` maps every detected entity (salient or not) to
        the sorted start times of the segments mentioning it. Both are empty
        when nothing was transcribed or no entity was found.

    Raises:
        RuntimeError: Propagated from ``extract_audio`` when ffmpeg fails.
    """
    _load_models()

    # 1. Extract audio
    audio_path = extract_audio(video_path)

    # 2. Transcribe. The temporary WAV is only needed for this step, so it
    # is removed as soon as transcription finishes or fails.
    try:
        result = _models["whisper"].transcribe(
            audio_path, language="id", word_timestamps=True
        )
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)

    empty = {"salient_entities": [], "entity_timeline": {}}

    segments = result.get('segments', [])
    # The end of the last segment stands in for the duration of the speech.
    duration = segments[-1]['end'] if segments else 0.0
    if duration == 0.0:
        return empty

    # 3. NER & feature extraction
    entities_data, total_entity_mentions = _collect_entity_mentions(segments)
    if total_entity_mentions == 0:
        return empty

    results, feature_vectors, entity_timeline = _build_entity_records(
        entities_data, duration, total_entity_mentions
    )

    # 4. Predict salience (column 1 of predict_proba is used as the score)
    X_scaled = _models["scaler"].transform(np.array(feature_vectors))
    probs = _models["classifier"].predict_proba(X_scaled)[:, 1]
    for r, p in zip(results, probs):
        r["salient_prob"] = float(p)

    # 5. Filter & sort
    salient_entities = [r for r in results if r["salient_prob"] > threshold]
    salient_entities.sort(key=lambda x: x["salient_prob"], reverse=True)

    return {
        "salient_entities": salient_entities[:top_k],
        "entity_timeline": entity_timeline,
    }
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
def calculate_temporal_features(timestamps, duration, total_mentions):
    """Compute temporal statistics for one entity's mention times.

    Args:
        timestamps: Times at which the entity was mentioned, in any order.
        duration: Total duration used to normalise times.
        total_mentions: Number of mentions of all entities combined.

    Returns:
        A ``(freq_norm, burstiness, first_appear, coverage)`` tuple:

        * ``freq_norm``: ``len(timestamps) / total_mentions``, or ``0.0``
          when ``total_mentions`` is not positive.
        * ``burstiness``: ``(sigma - mu) / (sigma + mu)`` over the gaps
          between consecutive mentions, where ``mu`` and ``sigma`` are the
          mean and standard deviation of those gaps; ``0.0`` with fewer than
          two mentions or when ``sigma + mu`` is not positive.
        * ``first_appear``: earliest timestamp divided by ``duration``.
        * ``coverage``: span between first and last mention divided by
          ``duration``.

        ``first_appear`` and ``coverage`` are ``0.0`` when ``duration`` is
        not positive. All four values are ``0.0`` for empty ``timestamps``.
    """
    timestamps = sorted(timestamps)
    freq = len(timestamps)

    # No mentions at all: there is no first/last timestamp to index, so
    # return neutral features instead of raising IndexError.
    if freq == 0:
        return 0.0, 0.0, 0.0, 0.0

    freq_norm = freq / total_mentions if total_mentions > 0 else 0.0

    if duration > 0:
        first_appear = timestamps[0] / duration
        coverage = (timestamps[-1] - timestamps[0]) / duration
    else:
        first_appear = 0.0
        coverage = 0.0

    burstiness = 0.0
    if freq > 1:
        inter_arrival = np.diff(timestamps)
        mu = np.mean(inter_arrival)
        sigma = np.std(inter_arrival)
        if (sigma + mu) > 0:
            # Cast so callers receive a plain Python float, like the others.
            burstiness = float((sigma - mu) / (sigma + mu))

    return freq_norm, burstiness, first_appear, coverage
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: indovse
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Indonesian Video Salient Entity Detection
|
|
5
|
+
Author: galihkjaya
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: openai-whisper
|
|
9
|
+
Requires-Dist: transformers>=4.40
|
|
10
|
+
Requires-Dist: torch>=2.0
|
|
11
|
+
Requires-Dist: scikit-learn>=1.4
|
|
12
|
+
Requires-Dist: joblib
|
|
13
|
+
Requires-Dist: huggingface_hub
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Requires-Dist: ffmpeg-python
|
|
17
|
+
Dynamic: author
|
|
18
|
+
Dynamic: description
|
|
19
|
+
Dynamic: description-content-type
|
|
20
|
+
Dynamic: requires-dist
|
|
21
|
+
Dynamic: requires-python
|
|
22
|
+
Dynamic: summary
|
|
23
|
+
|
|
24
|
+
# IndoVSE
|
|
25
|
+
|
|
26
|
+
IndoVSE is a Python package for extracting salient named entities from Indonesian videos.
|
|
27
|
+
|
|
28
|
+
## Requirements
|
|
29
|
+
|
|
30
|
+
You must have `ffmpeg` installed on your system (not via pip).
|
|
31
|
+
|
|
32
|
+
- **Ubuntu/Debian:** `sudo apt install ffmpeg`
|
|
33
|
+
- **MacOS:** `brew install ffmpeg`
|
|
34
|
+
- **Windows:** Download from official ffmpeg website and add it to your system PATH.
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install indovse
|
|
40
|
+
```
|
|
41
|
+
*(atau instal dari source: `pip install -e .` di dalam folder ini)*
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from indovse import predict
|
|
47
|
+
|
|
48
|
+
# Models are automatically downloaded on first call and cached in memory
|
|
49
|
+
# No GPU required, runs on CPU
|
|
50
|
+
result = predict("video.mp4", top_k=5)
|
|
51
|
+
|
|
52
|
+
# Output is a dict with 'salient_entities' and 'entity_timeline'
|
|
53
|
+
print(result["salient_entities"])
|
|
54
|
+
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
indovse
|
indovse-0.1.0/setup.cfg
ADDED
indovse-0.1.0/setup.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import os

from setuptools import setup, find_packages

# Resolve files next to setup.py so the build does not depend on the current
# working directory.
HERE = os.path.dirname(os.path.abspath(__file__))

# Used when requirements.txt is not shipped next to setup.py (for example in
# a source distribution that does not include it). Keep in sync with
# requirements.txt.
DEFAULT_REQUIREMENTS = [
    "openai-whisper",
    "transformers>=4.40",
    "torch>=2.0",
    "scikit-learn>=1.4",
    "joblib",
    "huggingface_hub",
    "numpy",
    "pandas",
    "ffmpeg-python",
]


def _read_requirements(filename="requirements.txt"):
    """Return the requirement specifiers from ``filename`` next to setup.py.

    Blank lines and ``#`` comment lines are skipped. If the file does not
    exist, a copy of ``DEFAULT_REQUIREMENTS`` is returned instead of failing
    the build.
    """
    path = os.path.join(HERE, filename)
    if not os.path.isfile(path):
        return list(DEFAULT_REQUIREMENTS)
    with open(path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]


with open(os.path.join(HERE, "README.md"), "r", encoding="utf-8") as fh:
    long_description = fh.read()

requirements = _read_requirements()

setup(
    name="indovse",
    version="0.1.0",
    author="galihkjaya",
    description="Indonesian Video Salient Entity Detection",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    install_requires=requirements,
    python_requires=">=3.9",
)
|