react-native-sherpa-onnx 0.3.5 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -0
- package/README.md +90 -21
- package/SherpaOnnx.podspec +3 -0
- package/THIRD_PARTY_LICENSES/README.md +62 -0
- package/THIRD_PARTY_LICENSES/ffmpeg.txt +502 -0
- package/THIRD_PARTY_LICENSES/libarchive.txt +65 -0
- package/THIRD_PARTY_LICENSES/nvidia_omla.txt +181 -0
- package/THIRD_PARTY_LICENSES/onnxruntime.txt +21 -0
- package/THIRD_PARTY_LICENSES/opus.txt +44 -0
- package/THIRD_PARTY_LICENSES/sherpa-onnx.txt +201 -0
- package/THIRD_PARTY_LICENSES/shine.txt +482 -0
- package/THIRD_PARTY_LICENSES/zstd.txt +30 -0
- package/android/build.gradle +7 -3
- package/android/prebuilt-download.gradle +345 -153
- package/android/prebuilt-versions.gradle +2 -2
- package/android/src/main/assets/model_licenses/asr-models-license-status.csv +409 -0
- package/android/src/main/assets/model_licenses/qnn-asr-models-license-status.csv +695 -0
- package/android/src/main/assets/model_licenses/tts-models-license-status.csv +596 -0
- package/android/src/main/cpp/CMakeLists.txt +28 -10
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.cpp +306 -6
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-helper.h +33 -4
- package/android/src/main/cpp/jni/archive/sherpa-onnx-archive-jni.cpp +266 -7
- package/android/src/main/cpp/jni/audio/sherpa-onnx-audio-convert-jni.cpp +268 -2
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-tts.cpp +6 -2
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-tts.cpp +4 -2
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxArchiveHelper.kt +137 -7
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxAssetHelper.kt +51 -6
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +159 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxOnlineSttHelper.kt +4 -1
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +112 -97
- package/ios/Resources/model_licenses/asr-models-license-status.csv +409 -0
- package/ios/Resources/model_licenses/qnn-asr-models-license-status.csv +695 -0
- package/ios/Resources/model_licenses/tts-models-license-status.csv +596 -0
- package/ios/SherpaOnnx+OnlineSTT.mm +2 -0
- package/ios/SherpaOnnx+PcmLiveStream.mm +2 -29
- package/ios/SherpaOnnx+TTS.mm +178 -20
- package/ios/SherpaOnnx.mm +108 -1
- package/ios/SherpaOnnxAudioConvert.h +10 -0
- package/ios/SherpaOnnxAudioConvert.mm +257 -1
- package/ios/archive/sherpa-onnx-archive-helper.h +10 -0
- package/ios/archive/sherpa-onnx-archive-helper.mm +56 -5
- package/ios/model_detect/sherpa-onnx-model-detect-tts.mm +13 -2
- package/ios/model_detect/sherpa-onnx-validate-tts.mm +4 -2
- package/ios/online_stt/sherpa-onnx-online-stt-wrapper.h +1 -0
- package/ios/online_stt/sherpa-onnx-online-stt-wrapper.mm +4 -0
- package/ios/tts/sherpa-onnx-tts-wrapper.h +37 -0
- package/ios/tts/sherpa-onnx-tts-wrapper.mm +149 -3
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/audio/index.js +8 -0
- package/lib/module/audio/index.js.map +1 -1
- package/lib/module/download/ModelDownloadManager.js +10 -929
- package/lib/module/download/ModelDownloadManager.js.map +1 -1
- package/lib/module/download/activeModelOperations.js +26 -0
- package/lib/module/download/activeModelOperations.js.map +1 -0
- package/lib/module/download/background-downloader.d.js +2 -0
- package/lib/module/download/background-downloader.d.js.map +1 -0
- package/lib/module/download/bulkPurge.js +72 -0
- package/lib/module/download/bulkPurge.js.map +1 -0
- package/lib/module/download/checksumPrompt.js +19 -0
- package/lib/module/download/checksumPrompt.js.map +1 -0
- package/lib/module/download/constants.js +7 -0
- package/lib/module/download/constants.js.map +1 -0
- package/lib/module/download/downloadEvents.js +35 -0
- package/lib/module/download/downloadEvents.js.map +1 -0
- package/lib/module/download/downloadTask.js +385 -0
- package/lib/module/download/downloadTask.js.map +1 -0
- package/lib/module/download/ensureModel.js +89 -0
- package/lib/module/download/ensureModel.js.map +1 -0
- package/lib/module/download/index.js +4 -3
- package/lib/module/download/index.js.map +1 -1
- package/lib/module/download/localModels.js +151 -0
- package/lib/module/download/localModels.js.map +1 -0
- package/lib/module/download/modelExtraction.js +174 -0
- package/lib/module/download/modelExtraction.js.map +1 -0
- package/lib/module/download/paths.js +98 -0
- package/lib/module/download/paths.js.map +1 -0
- package/lib/module/download/postDownloadProcessing.js +206 -0
- package/lib/module/download/postDownloadProcessing.js.map +1 -0
- package/lib/module/download/protectedModelKeys.js +31 -0
- package/lib/module/download/protectedModelKeys.js.map +1 -0
- package/lib/module/download/registry.js +267 -0
- package/lib/module/download/registry.js.map +1 -0
- package/lib/module/download/retry.js +59 -0
- package/lib/module/download/retry.js.map +1 -0
- package/lib/module/download/types.js +17 -0
- package/lib/module/download/types.js.map +1 -0
- package/lib/module/download/validation.js +101 -5
- package/lib/module/download/validation.js.map +1 -1
- package/lib/module/{download → extraction}/extractTarBz2.js +3 -1
- package/lib/module/extraction/extractTarBz2.js.map +1 -0
- package/lib/module/extraction/extractTarZst.js +54 -0
- package/lib/module/extraction/extractTarZst.js.map +1 -0
- package/lib/module/extraction/index.js +190 -0
- package/lib/module/extraction/index.js.map +1 -0
- package/lib/module/extraction/types.js +2 -0
- package/lib/module/extraction/types.js.map +1 -0
- package/lib/module/index.js +2 -1
- package/lib/module/index.js.map +1 -1
- package/lib/module/licenses.js +63 -0
- package/lib/module/licenses.js.map +1 -0
- package/lib/module/stt/index.js +16 -2
- package/lib/module/stt/index.js.map +1 -1
- package/lib/module/stt/streaming.js +2 -0
- package/lib/module/stt/streaming.js.map +1 -1
- package/lib/module/stt/streamingTypes.js.map +1 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +20 -2
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/streaming.js +4 -0
- package/lib/module/tts/streaming.js.map +1 -1
- package/lib/module/tts/types.js.map +1 -1
- package/lib/module/utils.js +16 -1
- package/lib/module/utils.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +72 -5
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/audio/index.d.ts +10 -0
- package/lib/typescript/src/audio/index.d.ts.map +1 -1
- package/lib/typescript/src/download/ModelDownloadManager.d.ts +10 -108
- package/lib/typescript/src/download/ModelDownloadManager.d.ts.map +1 -1
- package/lib/typescript/src/download/activeModelOperations.d.ts +6 -0
- package/lib/typescript/src/download/activeModelOperations.d.ts.map +1 -0
- package/lib/typescript/src/download/bulkPurge.d.ts +14 -0
- package/lib/typescript/src/download/bulkPurge.d.ts.map +1 -0
- package/lib/typescript/src/download/checksumPrompt.d.ts +3 -0
- package/lib/typescript/src/download/checksumPrompt.d.ts.map +1 -0
- package/lib/typescript/src/download/constants.d.ts +5 -0
- package/lib/typescript/src/download/constants.d.ts.map +1 -0
- package/lib/typescript/src/download/downloadEvents.d.ts +6 -0
- package/lib/typescript/src/download/downloadEvents.d.ts.map +1 -0
- package/lib/typescript/src/download/downloadTask.d.ts +20 -0
- package/lib/typescript/src/download/downloadTask.d.ts.map +1 -0
- package/lib/typescript/src/download/ensureModel.d.ts +26 -0
- package/lib/typescript/src/download/ensureModel.d.ts.map +1 -0
- package/lib/typescript/src/download/index.d.ts +7 -5
- package/lib/typescript/src/download/index.d.ts.map +1 -1
- package/lib/typescript/src/download/localModels.d.ts +15 -0
- package/lib/typescript/src/download/localModels.d.ts.map +1 -0
- package/lib/typescript/src/download/modelExtraction.d.ts +36 -0
- package/lib/typescript/src/download/modelExtraction.d.ts.map +1 -0
- package/lib/typescript/src/download/paths.d.ts +28 -0
- package/lib/typescript/src/download/paths.d.ts.map +1 -0
- package/lib/typescript/src/download/postDownloadProcessing.d.ts +19 -0
- package/lib/typescript/src/download/postDownloadProcessing.d.ts.map +1 -0
- package/lib/typescript/src/download/protectedModelKeys.d.ts +6 -0
- package/lib/typescript/src/download/protectedModelKeys.d.ts.map +1 -0
- package/lib/typescript/src/download/registry.d.ts +14 -0
- package/lib/typescript/src/download/registry.d.ts.map +1 -0
- package/lib/typescript/src/download/retry.d.ts +15 -0
- package/lib/typescript/src/download/retry.d.ts.map +1 -0
- package/lib/typescript/src/download/types.d.ts +96 -0
- package/lib/typescript/src/download/types.d.ts.map +1 -0
- package/lib/typescript/src/download/validation.d.ts +19 -0
- package/lib/typescript/src/download/validation.d.ts.map +1 -1
- package/lib/typescript/src/extraction/extractTarBz2.d.ts.map +1 -0
- package/lib/typescript/src/extraction/extractTarZst.d.ts +14 -0
- package/lib/typescript/src/extraction/extractTarZst.d.ts.map +1 -0
- package/lib/typescript/src/extraction/index.d.ts +50 -0
- package/lib/typescript/src/extraction/index.d.ts.map +1 -0
- package/lib/typescript/src/extraction/types.d.ts +60 -0
- package/lib/typescript/src/extraction/types.d.ts.map +1 -0
- package/lib/typescript/src/index.d.ts +1 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/licenses.d.ts +10 -0
- package/lib/typescript/src/licenses.d.ts.map +1 -0
- package/lib/typescript/src/stt/index.d.ts +4 -1
- package/lib/typescript/src/stt/index.d.ts.map +1 -1
- package/lib/typescript/src/stt/streaming.d.ts.map +1 -1
- package/lib/typescript/src/stt/streamingTypes.d.ts +5 -0
- package/lib/typescript/src/stt/streamingTypes.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +3 -1
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +3 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/streaming.d.ts.map +1 -1
- package/lib/typescript/src/tts/types.d.ts +6 -5
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/lib/typescript/src/utils.d.ts +5 -0
- package/lib/typescript/src/utils.d.ts.map +1 -1
- package/package.json +11 -1
- package/scripts/{check-model-csvs.sh → ci/check-model-csvs.sh} +9 -2
- package/scripts/ci/collect_all_sherpa_model_streams.sh +101 -0
- package/scripts/ci/collect_one_sherpa_release_stream.sh +189 -0
- package/scripts/ci/sherpa_asr_model_release_streams.json +21 -0
- package/scripts/ci/sherpa_tts_model_release_streams.json +13 -0
- package/scripts/ci/update_model_license_csv.sh +765 -0
- package/scripts/setup-ios-framework.sh +14 -11
- package/scripts/update_commercial_use.js +73 -0
- package/src/NativeSherpaOnnx.ts +92 -5
- package/src/audio/index.ts +20 -0
- package/src/download/ModelDownloadManager.ts +55 -1343
- package/src/download/activeModelOperations.ts +38 -0
- package/src/download/background-downloader.d.ts +43 -0
- package/src/download/bulkPurge.ts +102 -0
- package/src/download/checksumPrompt.ts +25 -0
- package/src/download/constants.ts +5 -0
- package/src/download/downloadEvents.ts +55 -0
- package/src/download/downloadTask.ts +497 -0
- package/src/download/ensureModel.ts +124 -0
- package/src/download/index.ts +19 -2
- package/src/download/localModels.ts +234 -0
- package/src/download/modelExtraction.ts +244 -0
- package/src/download/paths.ts +134 -0
- package/src/download/postDownloadProcessing.ts +292 -0
- package/src/download/protectedModelKeys.ts +30 -0
- package/src/download/registry.ts +404 -0
- package/src/download/retry.ts +76 -0
- package/src/download/types.ts +120 -0
- package/src/download/validation.ts +114 -8
- package/src/{download → extraction}/extractTarBz2.ts +3 -1
- package/src/extraction/extractTarZst.ts +79 -0
- package/src/extraction/index.ts +269 -0
- package/src/extraction/types.ts +63 -0
- package/src/index.tsx +2 -0
- package/src/licenses.ts +100 -0
- package/src/stt/index.ts +20 -2
- package/src/stt/streaming.ts +3 -0
- package/src/stt/streamingTypes.ts +5 -0
- package/src/stt/types.ts +3 -1
- package/src/tts/index.ts +30 -2
- package/src/tts/streaming.ts +10 -0
- package/src/tts/types.ts +6 -5
- package/src/utils.ts +22 -1
- package/third_party/libarchive_prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/libarchive_prebuilt/IOS_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/ANDROID_RELEASE_TAG +1 -1
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
- package/android/src/main/cpp/jni/tts/sherpa-onnx-tts-zipvoice-jni.cpp +0 -301
- package/android/src/main/java/com/sherpaonnx/ZipvoiceTtsWrapper.kt +0 -187
- package/lib/module/download/extractTarBz2.js.map +0 -1
- package/lib/typescript/src/download/extractTarBz2.d.ts.map +0 -1
- package/scripts/check-qnn-support.sh +0 -78
- /package/lib/typescript/src/{download → extraction}/extractTarBz2.d.ts +0 -0
|
@@ -0,0 +1,765 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Update model-license CSV from release asset list and pre-collected tree-cache.
|
|
3
|
+
#
|
|
4
|
+
# Goal: map each release asset (same names as *-models-expected.csv) to license_type and
|
|
5
|
+
# commercial_use hints for app distribution (ads, IAP). Not legal advice.
|
|
6
|
+
#
|
|
7
|
+
# Behavior:
|
|
8
|
+
# - Reads existing CSV if present; preserves rows and manual edits.
|
|
9
|
+
# - Merges in all assets from asset-list.txt (release); adds new rows with empty license_type.
|
|
10
|
+
# - Skips any asset with detection_source `manual` (hand-maintained row; never overwritten).
|
|
11
|
+
# - Skips any asset whose license_type and commercial_use are both set and neither is `unknown`
|
|
12
|
+
# (case-insensitive). Rows with empty or `unknown` license_type and/or empty or `unknown`
|
|
13
|
+
# commercial_use are (re)processed. license_type `exhausted` is never auto-overwritten.
|
|
14
|
+
# - Uses tree-cache (from asr/tts-models-structure.txt + new downloads) to see if a LICENSE-like
|
|
15
|
+
# path exists — no full extract unless we need file contents for detection.
|
|
16
|
+
# - Downloads the .tar.bz2 only when a license-like path was found and license_type is still empty.
|
|
17
|
+
# - Pipeline: try archive (if applicable) → HF/ModelScope fallbacks for eligible assets. If no license
|
|
18
|
+
# is found after all attempts, license_type is set to exhausted (default keyword; override with
|
|
19
|
+
# LICENSE_EXHAUSTED env). You can set exhausted manually after review.
|
|
20
|
+
# - .onnx-only: exhausted (no archive to scan).
|
|
21
|
+
# - HF fallback (vits-piper-*.tar.bz2, sherpa-onnx-*.tar.bz2): repo slug = asset basename without .tar.bz2
|
|
22
|
+
# under HF_MODEL_OWNER (default csukuangfj). Try MODEL_CARD (* License: …) then README.md YAML
|
|
23
|
+
# (---\nlicense: …). First successful source wins (HF before ModelScope). Only if HF has no license but
|
|
24
|
+
# README.md links to modelscope.cn/models/…, fetch that /summary HTML and read License from
|
|
25
|
+
# window.__detail_data__ JSON (\"License\":\"…\").
|
|
26
|
+
# license_file = HF repo URL or ModelScope summary URL; detection_source = huggingface_model_card or
|
|
27
|
+
# modelscope_detail_json. Release tarball names must match HF repo names or fetch 404s.
|
|
28
|
+
# - QNN binary stream (see --stream-id asr-models-qnn-binary, or QNN in asset name, or qnn-*-license-status.csv):
|
|
29
|
+
# after archive scan + HF fallback still yield nothing, last resort looks up a matching row in
|
|
30
|
+
# asr-models-license-status.csv (default: same directory as --csv). Strip prefix
|
|
31
|
+
# sherpa-onnx-qnn-<soc>-binary-<n>-seconds- from the QNN asset name, then try a few derived filenames
|
|
32
|
+
# (exact, sherpa-onnx-…, and sherpa-onnx-<stem>.tar.bz2 when …-int8.tar.bz2). On match, copy the ASR row’s
|
|
33
|
+
# license fields (not asset_name) onto the QNN asset; on no match → exhausted like other dead ends.
|
|
34
|
+
# - Hugging Face: set HF_TOKEN or HUGGINGFACE_HUB_TOKEN (read token is enough for public repos). Anonymous
|
|
35
|
+
# requests from CI often get HTTP 401; without a token README/MODEL_CARD cannot be fetched.
|
|
36
|
+
#
|
|
37
|
+
# Note: With `set -u`, ${#empty_assoc[@]} and ${!empty_assoc[@]} can error on some Bash builds;
|
|
38
|
+
# we avoid that below.
|
|
39
|
+
|
|
40
|
+
# Strict mode: stop on a failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Associative arrays are used throughout; they need Bash 4 or newer.
if [[ "${BASH_VERSINFO[0]}" -lt 4 ]]; then
  echo "This script requires Bash version 4+ (for associative arrays)." >&2
  exit 1
fi
|
|
46
|
+
|
|
47
|
+
ASSET_LIST=""
|
|
48
|
+
TREE_CACHE_DIR=""
|
|
49
|
+
CSV_FILE=""
|
|
50
|
+
STREAM_ID=""
|
|
51
|
+
ASR_LICENSE_CSV=""
|
|
52
|
+
|
|
53
|
+
# Parse the command line into the ASSET_LIST, TREE_CACHE_DIR, CSV_FILE,
# STREAM_ID and ASR_LICENSE_CSV globals.
# Arguments: the script's own "$@" (each option takes exactly one value).
# Exits 1 (message on stderr) on an unknown option or an option without a value.
_parse_cli_args() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --asset-list | --tree-cache-dir | --csv | --stream-id | --asr-license-csv)
        # Without this check a trailing option fails under `set -u` with an
        # opaque "$2: unbound variable" instead of a usable message.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $opt" >&2
          exit 1
        fi
        case "$opt" in
          --asset-list) ASSET_LIST="$2" ;;
          --tree-cache-dir) TREE_CACHE_DIR="$2" ;;
          --csv) CSV_FILE="$2" ;;
          --stream-id) STREAM_ID="$2" ;;
          --asr-license-csv) ASR_LICENSE_CSV="$2" ;;
        esac
        shift 2
        ;;
      *)
        echo "Unknown parameter $opt" >&2
        exit 1
        ;;
    esac
  done
}

_parse_cli_args "$@"
|
|
63
|
+
|
|
64
|
+
# --asset-list, --tree-cache-dir and --csv are mandatory.
if [[ -z "$ASSET_LIST" || -z "$TREE_CACHE_DIR" || -z "$CSV_FILE" ]]; then
  # Usage text is a diagnostic: send it to stderr so it never mixes with captured stdout.
  echo "Usage: $0 --asset-list <path> --tree-cache-dir <dir> --csv <path> [--stream-id <id>] [--asr-license-csv <path>]" >&2
  exit 1
fi

# When --asr-license-csv is not given, look for asr-models-license-status.csv in
# the (absolute) directory that holds the --csv file.
if [[ -z "$ASR_LICENSE_CSV" ]]; then
  ASR_LICENSE_CSV="$(cd -- "$(dirname -- "$CSV_FILE")" && pwd)/asr-models-license-status.csv"
fi
|
|
72
|
+
|
|
73
|
+
# Authenticated GitHub downloads (CI: GITHUB_TOKEN; local: GITHUB_TOKEN or GH_TOKEN).
|
|
74
|
+
_GH_TOKEN="${GITHUB_TOKEN:-${GH_TOKEN:-}}"
|
|
75
|
+
# Hugging Face raw file fetches (CI: often required to avoid 401 on huggingface.co).
|
|
76
|
+
_HF_TOKEN="${HF_TOKEN:-${HUGGINGFACE_HUB_TOKEN:-}}"
|
|
77
|
+
# Hugging Face repo slug matches release asset name without .tar.bz2 (e.g. vits-piper-pl_PL-darkman-medium).
|
|
78
|
+
HF_MODEL_OWNER="${HF_MODEL_OWNER:-csukuangfj}"
|
|
79
|
+
# license_type when all automated sources were tried and none yielded a license (skip on future runs).
|
|
80
|
+
LICENSE_EXHAUSTED="${LICENSE_EXHAUSTED:-exhausted}"
|
|
81
|
+
|
|
82
|
+
declare -A LICENSE_LIKE_BASENAMES=(
|
|
83
|
+
["license"]=1 ["license.txt"]=1 ["licence"]=1 ["licence.txt"]=1
|
|
84
|
+
["copying"]=1 ["copying.txt"]=1 ["notice"]=1 ["notice.txt"]=1
|
|
85
|
+
["copyright"]=1 ["copyright.txt"]=1 ["model_license"]=1 ["model_license.txt"]=1
|
|
86
|
+
["license.md"]=1 ["licence.md"]=1 ["copying.md"]=1 ["notice.md"]=1
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
declare -A existing_asset_name
|
|
90
|
+
declare -A existing_license_type
|
|
91
|
+
declare -A existing_commercial_use
|
|
92
|
+
declare -A existing_confidence
|
|
93
|
+
declare -A existing_detection_source
|
|
94
|
+
declare -A existing_license_file
|
|
95
|
+
|
|
96
|
+
# Load an existing license-status CSV into the existing_* associative arrays,
# keyed by asset name.
# Arguments: $1 - CSV path; a missing file is not an error (nothing is loaded).
# Globals written: existing_asset_name, existing_license_type,
#   existing_commercial_use, existing_confidence, existing_detection_source,
#   existing_license_file.
# Notes: the first line is treated as the header and skipped; fields are split
#   on plain commas (no embedded-comma support); one pair of surrounding double
#   quotes and a trailing CR are removed from every field; rows with an empty
#   asset name are ignored; columns after the sixth are discarded.
read_csv() {
  local csv_path="$1"
  [[ -f "$csv_path" ]] || return 0

  local is_header=1
  local asset_name license_type commercial_use confidence detection_source license_file remainder
  local field value

  # `|| [[ -n … ]]` keeps the final row when the file has no trailing newline.
  while IFS=, read -r asset_name license_type commercial_use confidence \
      detection_source license_file remainder || [[ -n "$asset_name" ]]; do
    if [[ "$is_header" -eq 1 ]]; then
      is_header=0
      continue
    fi

    # On CRLF files the CR sticks to whichever field is last on the line, so
    # every field is cleaned, not only the first and the sixth.
    for field in asset_name license_type commercial_use confidence detection_source license_file; do
      value="${!field}"
      value="${value%$'\r'}"
      value="${value%\"}"
      value="${value#\"}"
      printf -v "$field" '%s' "$value"
    done

    if [[ -z "$asset_name" ]]; then
      continue
    fi

    existing_asset_name["$asset_name"]="$asset_name"
    existing_license_type["$asset_name"]="$license_type"
    existing_commercial_use["$asset_name"]="$commercial_use"
    existing_confidence["$asset_name"]="$confidence"
    existing_detection_source["$asset_name"]="$detection_source"
    existing_license_file["$asset_name"]="$license_file"
  done < "$csv_path"
}
|
|
131
|
+
|
|
132
|
+
read_csv "$CSV_FILE"
|
|
133
|
+
|
|
134
|
+
# Row count for logging (avoid ${#assoc[@]} on empty assoc under set -u on some Bash versions).
# Prints the number of non-blank lines in $1 minus one for the header, never
# below 0; prints 0 when the file does not exist.
_count_csv_data_rows() {
  local csv_path="$1"
  local non_blank=0
  if [[ ! -f "$csv_path" ]]; then
    printf '0'
    return 0
  fi
  # grep -c already prints "0" when nothing matches (and exits 1). Only the
  # exit status is discarded: echoing a fallback 0 as well would yield two
  # lines and break the arithmetic on an empty or all-blank CSV.
  non_blank="$(grep -cve '^[[:space:]]*$' -- "$csv_path" 2>/dev/null || true)"
  [[ "$non_blank" =~ ^[0-9]+$ ]] || non_blank=0
  if (( non_blank > 0 )); then
    non_blank=$(( non_blank - 1 )) # minus header
  fi
  printf '%s' "$non_blank"
}

existing_csv_rows="$(_count_csv_data_rows "$CSV_FILE")"
|
|
141
|
+
|
|
142
|
+
echo "=== update_model_license_csv.sh ==="
|
|
143
|
+
echo "CSV path: $CSV_FILE"
|
|
144
|
+
[[ -n "$STREAM_ID" ]] && echo "Stream id: $STREAM_ID"
|
|
145
|
+
echo "ASR license lookup (QNN fallback): $ASR_LICENSE_CSV"
|
|
146
|
+
echo "Existing data rows in CSV (excl. header, by line count): $existing_csv_rows"
|
|
147
|
+
|
|
148
|
+
declare -a release_assets=()
declare -A asset_urls=()

# Read the release asset list ("name|url" per line) and merge it into the
# in-memory tables.
# Arguments: $1 - asset list path; a missing file loads nothing.
# Globals written: release_assets (appended in file order), asset_urls, and,
#   for names not already present, empty rows in the existing_* arrays.
# Notes: leading/trailing whitespace and a trailing CR are removed from both
#   fields; lines with an empty name are skipped.
_load_release_assets() {
  local list_path="$1"
  local name url
  [[ -f "$list_path" ]] || return 0

  # `|| [[ -n … ]]` keeps the last entry when the file lacks a final newline.
  while IFS='|' read -r name url || [[ -n "$name" ]]; do
    name="${name%$'\r'}"
    url="${url%$'\r'}"
    # Trim with parameter expansion instead of piping through xargs: xargs
    # interprets quotes and backslashes and fails on an unmatched quote, which
    # aborts the script under errexit/pipefail.
    name="${name#"${name%%[![:space:]]*}"}"
    name="${name%"${name##*[![:space:]]}"}"
    url="${url#"${url%%[![:space:]]*}"}"
    url="${url%"${url##*[![:space:]]}"}"

    [[ -n "$name" ]] || continue

    release_assets+=("$name")
    asset_urls["$name"]="$url"
    if [[ -z "${existing_asset_name["$name"]:-}" ]]; then
      # New on the release: create an empty row so the license pass fills it in.
      existing_asset_name["$name"]="$name"
      existing_license_type["$name"]=""
      existing_commercial_use["$name"]=""
      existing_confidence["$name"]=""
      existing_detection_source["$name"]=""
      existing_license_file["$name"]=""
    fi
  done < "$list_path"
}

_load_release_assets "$ASSET_LIST"
|
|
172
|
+
|
|
173
|
+
echo "Asset list file: ${ASSET_LIST:-<none>}"
|
|
174
|
+
echo "Tree cache dir: $TREE_CACHE_DIR"
|
|
175
|
+
echo "Release assets to consider: ${#release_assets[@]}"
|
|
176
|
+
if [[ ${#release_assets[@]} -eq 0 ]]; then
|
|
177
|
+
echo "Note: empty asset list — output CSV will only contain header plus any assets already in CSV but not on release (sorted)."
|
|
178
|
+
fi
|
|
179
|
+
echo "--- per-asset license pass ---"
|
|
180
|
+
|
|
181
|
+
# Print $1 with every "/" and "\" replaced by "-", followed by a newline.
get_safe_name() {
  local name="$1"
  name="${name//\//-}"
  name="${name//\\/-}"
  # printf, not echo: a sanitised name such as "-n" or "-e" (e.g. from "/n")
  # would otherwise be swallowed as an echo option.
  printf '%s\n' "$name"
}
|
|
187
|
+
|
|
188
|
+
# Record that no license could be determined for asset $1: license_type becomes
# $LICENSE_EXHAUSTED, commercial_use "unknown", confidence "high",
# detection_source "scan_exhausted", and license_file is cleared.
set_exhausted() {
  local asset="$1"
  existing_license_file["$asset"]=""
  existing_detection_source["$asset"]="scan_exhausted"
  existing_confidence["$asset"]="high"
  existing_commercial_use["$asset"]="unknown"
  existing_license_type["$asset"]="$LICENSE_EXHAUSTED"
}
|
|
196
|
+
|
|
197
|
+
# Store a license found in a file inside the release archive.
# Arguments: $1 asset name, $2 license_type, $3 commercial_use, $4 confidence,
#            $5 license_file.
# detection_source is always recorded as "archive_license_file".
set_detected() {
  local asset="$1" lic_type="$2" commercial="$3" level="$4" lic_path="$5"
  existing_detection_source["$asset"]="archive_license_file"
  existing_license_file["$asset"]="$lic_path"
  existing_confidence["$asset"]="$level"
  existing_commercial_use["$asset"]="$commercial"
  existing_license_type["$asset"]="$lic_type"
}
|
|
209
|
+
|
|
210
|
+
# Store a license obtained from online metadata.
# Arguments: $1 asset name, $2 license_type, $3 commercial_use, $4 confidence,
#            $5 page URL (stored in license_file),
#            $6 detection_source (optional, default "huggingface_model_card").
set_hf_model_card() {
  local asset="$1" lic_type="$2" commercial="$3" level="$4" source_url="$5"
  local origin="${6:-huggingface_model_card}"
  existing_license_file["$asset"]="$source_url"
  existing_detection_source["$asset"]="$origin"
  existing_confidence["$asset"]="$level"
  existing_commercial_use["$asset"]="$commercial"
  existing_license_type["$asset"]="$lic_type"
}
|
|
223
|
+
|
|
224
|
+
# Download one raw file from the main branch of the Hugging Face repo
# ${HF_MODEL_OWNER}/<slug> and write it to stdout.
# Arguments: $1 - repo slug, $2 - file name inside the repo.
# Returns curl's exit status (non-zero on HTTP errors because of -f). curl's
# stderr is discarded: a 404 for MODEL_CARD is an expected outcome.
# A bearer token is sent only when _HF_TOKEN is non-empty.
fetch_hf_repo_file() {
  local slug="$1"
  local filename="$2"
  local raw_url="https://huggingface.co/${HF_MODEL_OWNER}/${slug}/raw/main/${filename}"
  if [[ -n "$_HF_TOKEN" ]]; then
    curl -sfSL -H "Authorization: Bearer ${_HF_TOKEN}" "$raw_url" 2>/dev/null
  else
    curl -sfSL "$raw_url" 2>/dev/null
  fi
}
|
|
235
|
+
|
|
236
|
+
# Print the value of the first non-empty "* License: value" bullet in the text
# passed as $1 (no trailing newline). The label is matched case-insensitively.
# Returns 0 when a value was printed, 1 when no such line exists.
# The value is whitespace-trimmed and runs of blanks collapse to one space;
# quotes and backslashes in the value are kept verbatim.
parse_model_card_license_field() {
  local card="$1"
  local line lic
  local -a words
  local IFS=$' \t\n'
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"
    if [[ "$line" =~ ^[*][[:space:]]*[Ll][Ii][Cc][Ee][Nn][Ss][Ee]:[[:space:]]*(.*) ]]; then
      # Word-split with read instead of xargs: xargs treats quotes as syntax
      # and fails on e.g. an apostrophe, which silently dropped the license.
      read -r -a words <<< "${BASH_REMATCH[1]}" || true
      lic="${words[*]:-}"
      if [[ -n "$lic" ]]; then
        printf '%s' "$lic"
        return 0
      fi
    fi
  done <<< "$card"
  return 1
}
|
|
253
|
+
|
|
254
|
+
# Clean a raw "license:" value: trim it and collapse runs of blanks (without
# interpreting quotes), then drop one leading/trailing double quote and one
# leading/trailing single quote. Prints the result without a newline.
_readme_license_value() {
  local IFS=$' \t\n'
  local -a words
  local val
  # read instead of xargs: xargs fails on an unmatched quote or apostrophe,
  # which made such license values disappear.
  read -r -a words <<< "$1" || true
  val="${words[*]:-}"
  val="${val#\"}"; val="${val%\"}"
  val="${val#\'}"; val="${val%\'}"
  printf '%s' "$val"
}

# Hugging Face model cards often use YAML front matter: ---\nlicense: apache-2.0\n---
# Print the license declared in the README text passed as $1 (no newline).
# Pass 1 looks only between the first "---" line and the next one; pass 2
# accepts a "license:" line anywhere in the text. Empty values are skipped.
# Returns 0 when a value was printed, 1 otherwise.
parse_readme_yaml_license_field() {
  local readme="$1"
  local line val in_fm=0

  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"
    if [[ "$line" == "---" ]]; then
      # Second delimiter closes the front matter: stop pass 1.
      if [[ "$in_fm" -eq 1 ]]; then
        break
      fi
      in_fm=1
      continue
    fi
    if [[ "$in_fm" -eq 1 ]] && [[ "$line" =~ ^[Ll]icense:[[:space:]]*(.*) ]]; then
      val="$(_readme_license_value "${BASH_REMATCH[1]}")"
      if [[ -n "$val" ]]; then
        printf '%s' "$val"
        return 0
      fi
    fi
  done <<< "$readme"

  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"
    if [[ "$line" =~ ^[Ll]icense:[[:space:]]*(.*) ]]; then
      val="$(_readme_license_value "${BASH_REMATCH[1]}")"
      if [[ -n "$val" ]]; then
        printf '%s' "$val"
        return 0
      fi
    fi
  done <<< "$readme"
  return 1
}
|
|
296
|
+
|
|
297
|
+
# Print the first modelscope.cn/models/… URL found in the text passed as $1
# (no trailing newline). URLs with an explicit http(s) scheme are preferred;
# otherwise the first schemeless mention is used and "https://" is prepended.
# Returns 1 when the text contains no such link.
extract_first_modelscope_models_url() {
  local readme="$1"
  local url
  # `|| true`: with pipefail a no-match grep (or head closing the pipe early)
  # fails the assignment; under errexit that would abort before the fallback.
  url="$(printf '%s\n' "$readme" | grep -oE 'https?://(www\.)?modelscope\.cn/models/[A-Za-z0-9_./%-]+' | head -n 1)" || true
  if [[ -z "$url" ]]; then
    url="$(printf '%s\n' "$readme" | grep -oE '(www\.)?modelscope\.cn/models/[A-Za-z0-9_./%-]+' | head -n 1)" || true
    if [[ -n "$url" ]]; then
      url="https://${url}"
    fi
  fi
  # The path character class allows ".", so a link that ends a sentence picks
  # up the full stop; strip it so the derived URL stays valid.
  while [[ "$url" == *. ]]; do
    url="${url%.}"
  done
  [[ -n "$url" ]] || return 1
  printf '%s' "$url"
}
|
|
311
|
+
|
|
312
|
+
# ModelScope model pages embed JSON in HTML; License field uses escaped quotes: \"License\":\"Apache License 2.0\"
|
|
313
|
+
# Turn a ModelScope model URL into its /summary page URL: drop everything from
# the first "?", drop one trailing "/", and append "/summary" unless the URL
# already ends with it. Prints the result without a trailing newline.
normalize_modelscope_summary_url() {
  local base="${1%%\?*}"
  base="${base%/}"
  case "$base" in
    */summary) echo -n "$base" ;;
    *) echo -n "${base}/summary" ;;
  esac
}
|
|
322
|
+
|
|
323
|
+
# Fetch the /summary page for the ModelScope model URL in $1 and write the HTML
# to stdout. The URL is first passed through normalize_modelscope_summary_url.
# Returns curl's exit status; curl's stderr is discarded.
fetch_modelscope_summary_html() {
  local summary_url
  summary_url="$(normalize_modelscope_summary_url "$1")"
  curl -sfSL \
    -A "Mozilla/5.0 (compatible; react-native-sherpa-onnx-license-update/1.0)" \
    "$summary_url" 2>/dev/null
}
|
|
330
|
+
|
|
331
|
+
# Print the License value embedded in a ModelScope page (no trailing newline).
# Arguments: $1 - page HTML.
# Tries the backslash-escaped form (\"License\":\"…\") first, then the plain
# JSON form ("License":"…"); on a line with several matches the last one wins
# because the leading ".*" is greedy. Only the first matching line is used.
# Returns 0 when a non-empty value was printed, 1 otherwise.
parse_modelscope_license_from_html() {
  local html="$1"
  local lic pattern
  local -a words
  local IFS=$' \t\n'
  for pattern in \
    's/.*License\\":\\"\([^\\]*\)\\".*/\1/p' \
    's/.*"License":"\([^"]*\)".*/\1/p'; do
    # `|| true`: head may close the pipe early; with pipefail that must not
    # discard the value that was already captured.
    lic="$(printf '%s' "$html" | sed -n "$pattern" | head -n 1)" || true
    # Trim and collapse blanks with read instead of xargs, which fails on a
    # value containing an apostrophe or quote and so lost the license.
    read -r -a words <<< "$lic" || true
    lic="${words[*]:-}"
    if [[ -n "$lic" ]]; then
      printf '%s' "$lic"
      return 0
    fi
  done
  return 1
}
|
|
348
|
+
|
|
349
|
+
# Succeeds (status 0) when asset name $1 matches vits-piper-*.tar.bz2 or
# sherpa-onnx-*.tar.bz2; returns 1 for every other name.
asset_eligible_for_hf_license_fallback() {
  case "$1" in
    vits-piper-*.tar.bz2 | sherpa-onnx-*.tar.bz2)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}
|
|
353
|
+
|
|
354
|
+
# Print a human-readable label for where asset $1's license came from, based on
# the detection_source currently stored for it (set by a successful
# try_hf_model_card_fallback). Any other or missing source prints "online metadata".
log_license_fallback_source() {
  local asset="$1"
  local src
  src=${existing_detection_source["$asset"]:-}
  if [[ "$src" == "modelscope_detail_json" ]]; then
    echo "ModelScope (via link in Hugging Face README)"
  elif [[ "$src" == "huggingface_model_card" ]]; then
    echo "Hugging Face (MODEL_CARD or README)"
  else
    echo "online metadata"
  fi
}
|
|
369
|
+
|
|
370
|
+
# Online license lookup for one asset.
# Order of sources: Hugging Face MODEL_CARD, then Hugging Face README.md, and
# only when neither yields a license and the README was fetched and links to
# modelscope.cn/models/…, the ModelScope summary page.
# Arguments: $1 - asset name (must pass asset_eligible_for_hf_license_fallback).
# Returns 1 when the asset is not eligible or no license text was found;
# otherwise stores the result via set_hf_model_card and returns 0. When
# detect_license classifies the text as "unknown", the raw text is stored as
# license_type with commercial_use "unknown" and confidence "medium".
try_hf_model_card_fallback() {
  local asset_name="$1"
  local repo_slug hf_page model_card readme_text found_license detected
  local det_type det_use det_conf
  local ref_url ref_source ms_link ms_page ms_license
  local final_type final_use final_conf

  if ! asset_eligible_for_hf_license_fallback "$asset_name"; then
    return 1
  fi

  # The HF repo is expected to be named like the tarball without its extension.
  repo_slug="${asset_name%.tar.bz2}"
  hf_page="https://huggingface.co/${HF_MODEL_OWNER}/${repo_slug}"
  ref_url="$hf_page"
  ref_source="huggingface_model_card"
  readme_text=""
  found_license=""

  # Source 1: MODEL_CARD bullet list.
  if model_card="$(fetch_hf_repo_file "$repo_slug" "MODEL_CARD")"; then
    found_license="$(parse_model_card_license_field "$model_card")" || found_license=""
  fi

  # Source 2: README.md, fetched only when MODEL_CARD gave nothing.
  if [[ -z "$found_license" ]] && readme_text="$(fetch_hf_repo_file "$repo_slug" "README.md")"; then
    found_license="$(parse_readme_yaml_license_field "$readme_text")" || found_license=""
  fi

  # Source 3: ModelScope page linked from the README.
  if [[ -z "$found_license" && -n "$readme_text" ]]; then
    if ms_link="$(extract_first_modelscope_models_url "$readme_text")"; then
      ms_page="$(fetch_modelscope_summary_html "$ms_link")" || ms_page=""
      if [[ -n "$ms_page" ]] && ms_license="$(parse_modelscope_license_from_html "$ms_page")"; then
        found_license="$ms_license"
        ref_url="$(normalize_modelscope_summary_url "$ms_link")"
        ref_source="modelscope_detail_json"
      fi
    fi
  fi

  if [[ -z "$found_license" ]]; then
    return 1
  fi

  # detect_license output is read as "type|commercial_use|confidence".
  detected="$(detect_license "$found_license")"
  det_type="$(echo "$detected" | cut -d'|' -f1)"
  det_use="$(echo "$detected" | cut -d'|' -f2)"
  det_conf="$(echo "$detected" | cut -d'|' -f3)"

  final_type="$det_type"
  final_use="$det_use"
  final_conf="$det_conf"
  if [[ "$det_type" == "unknown" ]]; then
    # Unclassified text: keep the raw string so a reviewer can see what was found.
    final_type="$found_license"
    final_use="unknown"
    final_conf="medium"
  fi

  set_hf_model_card "$asset_name" "$final_type" "$final_use" "$final_conf" "$ref_url" "$ref_source"
  return 0
}
|
|
418
|
+
|
|
419
|
+
# QNN binary assets: mirror license row from asr-models-license-status.csv (last resort).
|
|
420
|
+
qnn_license_fallback_context() {
  # Succeeds (0) when the current run or the given asset looks QNN-related:
  #   - STREAM_ID is "asr-models-qnn-binary", or
  #   - the CSV being written is qnn-asr-models-license-status.csv, or
  #   - the asset name ($1) contains "qnn" in any letter case.
  # Returns 1 otherwise.
  if [[ "${STREAM_ID:-}" == "asr-models-qnn-binary" ]]; then
    return 0
  fi
  if [[ "$(basename "$CSV_FILE")" == "qnn-asr-models-license-status.csv" ]]; then
    return 0
  fi
  case "$1" in
    *[Qq][Nn][Nn]*) return 0 ;;
  esac
  return 1
}
|
|
426
|
+
|
|
427
|
+
strip_qnn_binary_asset_prefix() {
  # Prints (without a trailing newline) whatever follows the
  # "sherpa-onnx-qnn-<token>-binary-<digits>-seconds-" prefix of $1.
  # Returns 1 and prints nothing when $1 does not have that shape.
  local asset="$1"
  local qnn_prefix_re='^sherpa-onnx-qnn-[^-]+-binary-[0-9]+-seconds-(.+)$'

  [[ "$asset" =~ $qnn_prefix_re ]] || return 1
  echo -n "${BASH_REMATCH[1]}"
  return 0
}
|
|
435
|
+
|
|
436
|
+
# Prints the first data row of CSV file $1 whose first comma-separated field
# equals $2 (a trailing CR on that field is ignored). The header line is never
# considered. Prints nothing when no row matches; returns 1 when $1 is not a
# regular file.
asr_license_csv_row_for_asset_name() {
  local csv_path="$1" wanted="$2"

  if [[ ! -f "$csv_path" ]]; then
    return 1
  fi

  awk -F',' -v n="$wanted" '
    NR > 1 {
      first = $1
      sub(/\r$/, "", first)
      if (first == n) {
        print $0
        exit
      }
    }
  ' "$csv_path"
}
|
|
450
|
+
|
|
451
|
+
# Copies the license columns of one ASR CSV row onto a QNN asset.
# Arguments: $1 - QNN asset name (used as the key in the existing_* maps)
#            $2 - raw CSV row: asset_name,license_type,commercial_use,
#                 confidence,detection_source,license_file
# Globals:   existing_license_type, existing_commercial_use,
#            existing_confidence, existing_detection_source,
#            existing_license_file (written)
apply_asr_license_line_to_qnn_asset() {
  local qnn_asset="$1"
  local row="${2%$'\r'}"
  local asr_asset license_type commercial_use confidence detection_source license_file remainder
  local field value

  IFS=',' read -r asr_asset license_type commercial_use confidence detection_source license_file remainder <<< "$row"

  # Anything past the sixth comma belongs to license_file; glue it back on.
  if [[ -n "${remainder:-}" ]]; then
    license_file="${license_file},${remainder}"
  fi

  # Drop one trailing and one leading double quote from every copied column.
  for field in license_type commercial_use confidence detection_source license_file; do
    value="${!field}"
    value="${value%\"}"
    value="${value#\"}"
    printf -v "$field" '%s' "$value"
  done

  existing_license_type["$qnn_asset"]="$license_type"
  existing_commercial_use["$qnn_asset"]="$commercial_use"
  existing_confidence["$qnn_asset"]="$confidence"
  existing_detection_source["$qnn_asset"]="$detection_source"
  existing_license_file["$qnn_asset"]="$license_file"
}
|
|
472
|
+
|
|
473
|
+
# Name of the ASR asset whose CSV row was mirrored by the most recent
# successful try_qnn_asr_license_fallback call; empty otherwise.
# Do not call that function inside $(…): a subshell would drop both this
# variable and the associative-array updates it performs.
_QNN_ASR_MIRROR_MATCHED=""

# Last-resort license lookup for QNN binary assets: reuse the row of the
# corresponding plain ASR asset from ASR_LICENSE_CSV.
# Arguments: $1 - QNN release asset name
# Globals:   ASR_LICENSE_CSV (read), _QNN_ASR_MIRROR_MATCHED (written),
#            existing_* maps (written via apply_asr_license_line_to_qnn_asset)
# Returns:   0 when a row was found and applied; 1 when the asset is not in a
#            QNN context, has no QNN binary prefix, or no candidate matched.
try_qnn_asr_license_fallback() {
  local asset_name="$1"
  local derived cand row matched_asr="" unquantized
  local -a cands=()
  local -A tried=()

  _QNN_ASR_MIRROR_MATCHED=""
  qnn_license_fallback_context "$asset_name" || return 1
  derived="$(strip_qnn_binary_asset_prefix "$asset_name")" || return 1

  # Candidate ASR names, most specific first.
  cands+=("$derived")
  if [[ "$derived" != sherpa-onnx-* ]]; then
    cands+=("sherpa-onnx-${derived}")
  fi
  if [[ "$derived" == *-int8.tar.bz2 ]]; then
    # An int8 variant may mirror the non-int8 ASR row. Add the
    # "sherpa-onnx-" prefix only when it is missing, so an already-prefixed
    # name does not become "sherpa-onnx-sherpa-onnx-…".
    unquantized="${derived%-int8.tar.bz2}.tar.bz2"
    if [[ "$unquantized" == sherpa-onnx-* ]]; then
      cands+=("$unquantized")
    else
      cands+=("sherpa-onnx-${unquantized}")
    fi
  fi

  row=""
  for cand in "${cands[@]}"; do
    # Skip empty and already-tried candidates.
    if [[ -z "$cand" || -n "${tried["$cand"]:-}" ]]; then
      continue
    fi
    tried["$cand"]=1
    # A failing lookup (e.g. missing CSV) just means "no row" for this candidate.
    row="$(asr_license_csv_row_for_asset_name "$ASR_LICENSE_CSV" "$cand")" || row=""
    if [[ -n "$row" ]]; then
      matched_asr="$cand"
      break
    fi
  done

  [[ -n "$row" ]] || return 1
  apply_asr_license_line_to_qnn_asset "$asset_name" "$row"
  _QNN_ASR_MIRROR_MATCHED="$matched_asr"
  return 0
}
|
|
506
|
+
|
|
507
|
+
# Classifies license text or a short license identifier by keyword matching.
# Arguments: $1 - license text (full file contents or a short tag like "mit")
# Outputs:   one line "license_type|commercial_use|confidence" on stdout;
#            "unknown|unknown|low" when nothing matches.
# Matching is case-insensitive on whitespace-collapsed, trimmed text. Rules are
# tried top to bottom and the first hit wins.
detect_license() {
  local t="$1"
  local before_agpl before_lgpl before_gpl
  local first_hit gnu_family=""

  # Lowercase and collapse whitespace runs. printf (not echo) avoids adding a
  # newline that tr would turn into a trailing space and so defeat the exact
  # "mit" comparison below; trim the single space that may remain at each end.
  t="$(printf '%s' "$t" | tr '[:upper:]' '[:lower:]' | tr -s ' \r\n\t' ' ')"
  t="${t#" "}"
  t="${t%" "}"

  # GNU licenses cite each other (the GPL text names the Affero and Lesser
  # licenses further down), so pick the family whose title occurs first
  # instead of testing mere presence in a fixed order.
  before_agpl="${t%%"gnu affero general public license"*}"
  before_lgpl="${t%%"gnu lesser general public license"*}"
  before_gpl="${t%%"gnu general public license"*}"
  first_hit=${#t}
  if (( ${#before_agpl} < first_hit )); then first_hit=${#before_agpl}; gnu_family="agpl"; fi
  if (( ${#before_lgpl} < first_hit )); then first_hit=${#before_lgpl}; gnu_family="lgpl"; fi
  if (( ${#before_gpl} < first_hit )); then first_hit=${#before_gpl}; gnu_family="gpl"; fi

  if [[ "$t" == *"cc0"* || "$t" == *"cc-0"* || "$t" == *"creative commons zero"* || "$t" == *"public domain dedication"* ]]; then
    echo "cc0|yes|high"
  elif [[ "$t" == *"apache-2.0"* || "$t" == *"apache 2.0"* || "$t" == *"apache license 2.0"* ]]; then
    echo "apache-2.0|yes|high"
  elif [[ "$t" == *"apache license"* && "$t" == *"version 2.0"* ]]; then
    echo "apache-2.0|yes|high"
  elif [[ "$t" == "mit" || "$t" == *"mit license"* ]]; then
    echo "mit|yes|high"
  elif [[ "$t" == *"bsd 3-clause"* || ( "$t" == *"redistribution and use in source and binary forms"* && "$t" == *"neither the name"* ) ]]; then
    echo "bsd-3-clause|yes|medium"
  elif [[ "$t" == *"bsd 2-clause"* ]]; then
    echo "bsd-2-clause|yes|medium"
  elif [[ "$t" == *"mozilla public license"* && "$t" == *"2.0"* ]]; then
    echo "mpl-2.0|yes|high"
  elif [[ "$t" == *"isc license"* ]]; then
    echo "isc|yes|medium"
  elif [[ "$t" == *"the unlicense"* ]]; then
    echo "unlicense|yes|medium"
  elif [[ "$t" == *"zlib license"* ]]; then
    echo "zlib|yes|medium"
  elif [[ "$gnu_family" == "agpl" ]]; then
    echo "agpl-3.0|conditional|high"
  elif [[ "$gnu_family" == "lgpl" ]]; then
    if [[ "$t" == *"version 2.1"* ]]; then
      echo "lgpl-2.1|conditional|high"
    elif [[ "$t" == *"version 3"* ]]; then
      echo "lgpl-3.0|conditional|high"
    else
      echo "lgpl|conditional|medium"
    fi
  elif [[ "$gnu_family" == "gpl" ]]; then
    if [[ "$t" == *"version 3"* ]]; then
      echo "gpl-3.0|conditional|high"
    elif [[ "$t" == *"version 2"* ]]; then
      echo "gpl-2.0|conditional|high"
    else
      echo "gpl|conditional|medium"
    fi
  elif [[ "$t" == *"creative commons"* && "$t" == *"noncommercial"* ]]; then
    if [[ "$t" == *"4.0"* ]]; then
      echo "cc-by-nc-4.0|no|high"
    else
      echo "cc-by-nc|no|medium"
    fi
  elif [[ "$t" == *"creative commons attribution 4.0"* || ( "$t" == *"creative commons"* && "$t" == *"attribution"* && "$t" == *"4.0"* ) ]]; then
    echo "cc-by-4.0|yes|high"
  elif [[ "$t" == *"non-commercial"* || "$t" == *"non commercial"* ]]; then
    echo "custom-non-commercial|no|medium"
  elif [[ "$t" == *"research only"* || "$t" == *"for research purposes only"* ]]; then
    echo "custom-research-only|no|medium"
  else
    echo "unknown|unknown|low"
  fi
}
|
|
540
|
+
|
|
541
|
+
# Shared tail for every path where the archive itself gave no usable license:
# try the Hugging Face / ModelScope fallback, then the QNN → ASR mirror, and
# finally mark the asset as exhausted. Always logs exactly one line.
# Arguments: $1 - asset name
#            $2 - reason text for the "filled from …" log line
#            $3 - reason text for the "QNN mirror" log line
#            $4 - reason text for the "fallbacks exhausted" log line
# Globals:   existing_license_type, _QNN_ASR_MIRROR_MATCHED,
#            LICENSE_EXHAUSTED (read)
resolve_license_via_fallbacks() {
  local asset="$1" hf_reason="$2" qnn_reason="$3" exhausted_reason="$4"

  if try_hf_model_card_fallback "$asset"; then
    echo " $asset — ${hf_reason} → filled from $(log_license_fallback_source "$asset") (license_type=${existing_license_type["$asset"]})"
    return 0
  fi
  if try_qnn_asr_license_fallback "$asset"; then
    echo " $asset — ${qnn_reason} + HF exhausted → QNN mirror from asr row (${_QNN_ASR_MIRROR_MATCHED}) (license_type=${existing_license_type["$asset"]})"
    return 0
  fi
  set_exhausted "$asset"
  echo " $asset — ${exhausted_reason} + fallbacks exhausted → license_type=$LICENSE_EXHAUSTED"
}

# Per-asset pass: skip rows that are already settled, otherwise look for a
# license-like file in the cached archive listing, download the archive,
# classify the extracted text, and fall back to online sources when needed.
for asset_name in "${release_assets[@]}"; do
  url="${asset_urls["$asset_name"]}"

  # Current CSV state for this asset, passed through xargs, plus lowercase
  # copies used for the comparisons below.
  l_type="${existing_license_type["$asset_name"]:-}"
  l_type="$(echo -n "$l_type" | xargs)"
  l_type_lc="$(echo -n "$l_type" | tr '[:upper:]' '[:lower:]')"
  c_use="${existing_commercial_use["$asset_name"]:-}"
  c_use="$(echo -n "$c_use" | xargs)"
  c_use_lc="$(echo -n "$c_use" | tr '[:upper:]' '[:lower:]')"
  det_src="${existing_detection_source["$asset_name"]:-}"
  det_src="$(echo -n "$det_src" | xargs)"
  det_src_lc="$(echo -n "$det_src" | tr '[:upper:]' '[:lower:]')"

  if [[ "$det_src_lc" == "manual" ]]; then
    echo " $asset_name — skip (detection_source=manual)"
    continue
  fi
  if [[ "$l_type_lc" == "exhausted" ]]; then
    echo " $asset_name — skip (license_type=exhausted; clear to re-run automation)"
    continue
  fi
  # Only (re)fill when license_type or commercial_use is empty or explicitly unknown.
  if [[ -n "$l_type" && "$l_type_lc" != "unknown" && -n "$c_use" && "$c_use_lc" != "unknown" ]]; then
    echo " $asset_name — skip (license_type and commercial_use already set)"
    continue
  fi

  if [[ "$asset_name" == *.onnx ]]; then
    set_exhausted "$asset_name"
    echo " $asset_name — .onnx bundle → license_type=$LICENSE_EXHAUSTED (no archive; skipped next run)"
    continue
  fi

  safe_name="$(get_safe_name "$asset_name")"
  tree_path="${TREE_CACHE_DIR}/${safe_name}.txt"

  # Collect de-duplicated license-like member paths from the cached listing.
  declare -a license_paths=()
  if [[ -f "$tree_path" ]]; then
    declare -A seen_paths=()
    while IFS= read -r line; do
      s="${line%$'\r'}"
      s="$(echo -n "$s" | xargs)"
      # Skip blank entries and directories.
      if [[ -z "$s" || "$s" == */ ]]; then continue; fi

      base="${s##*/}"
      base_lower="$(echo -n "$base" | tr '[:upper:]' '[:lower:]')"

      # Known license basenames, or any basename mentioning license/licence.
      if [[ -n "${LICENSE_LIKE_BASENAMES["$base_lower"]:-}" || "$base_lower" == *"license"* || "$base_lower" == *"licence"* ]]; then
        if [[ -z "${seen_paths["$s"]:-}" ]]; then
          license_paths+=("$s")
          seen_paths["$s"]=1
        fi
      fi
    done < "$tree_path"
    unset seen_paths
  fi

  if [[ ${#license_paths[@]} -eq 0 ]]; then
    resolve_license_via_fallbacks "$asset_name" \
      "no license in tree" "no license in tree" "no license in tree"
    continue
  fi

  echo " $asset_name — found ${#license_paths[@]} license-like path(s), downloading archive…"
  td="$(mktemp -d -t model-license-XXXXXX)"
  archive_path="${td}/${safe_name}"

  _curl_dl=(-sSL)
  if [[ -n "$_GH_TOKEN" && "$url" == *"github.com"* ]]; then
    _curl_dl+=(-H "Authorization: Bearer ${_GH_TOKEN}" -H "Accept: application/octet-stream")
  fi
  if ! curl "${_curl_dl[@]}" -o "$archive_path" "$url"; then
    rm -rf -- "$td"
    resolve_license_via_fallbacks "$asset_name" \
      "download failed" "download failed" "download failed"
    continue
  fi

  extracted_text=""
  used_file="${license_paths[0]}"
  for p in "${license_paths[@]}"; do
    # Try the listed path first, then the same member with "./" removed or added.
    member_candidates=("$p")
    if [[ "$p" == ./* ]]; then
      member_candidates+=("${p:2}")
    else
      member_candidates+=("./$p")
    fi

    for c in "${member_candidates[@]}"; do
      if [[ -z "$c" ]]; then continue; fi
      # Avoid bash "ignored null byte" from $(...) and cap size (wrong member / binary).
      out="$(
        tar -xOf "$archive_path" "$c" 2>/dev/null | head -c 524288 | tr -d '\000' || true
      )"
      if [[ -n "$out" ]]; then
        extracted_text="$out"
        used_file="$p"
        break 2
      fi
    done
  done

  if [[ -z "$extracted_text" ]]; then
    rm -rf -- "$td"
    resolve_license_via_fallbacks "$asset_name" \
      "could not extract license file" "could not extract license" "could not extract license file"
    continue
  fi

  # detect_license prints "license|commercial_use|confidence".
  det="$(detect_license "$extracted_text")"
  l_res="$(echo "$det" | cut -d'|' -f1)"
  c_res="$(echo "$det" | cut -d'|' -f2)"
  conf_res="$(echo "$det" | cut -d'|' -f3)"

  rm -rf -- "$td"

  if [[ "$l_res" == "unknown" ]]; then
    resolve_license_via_fallbacks "$asset_name" \
      "archive license text unknown" "archive text unknown" "archive text unclassified"
    continue
  fi

  set_detected "$asset_name" "$l_res" "$c_res" "$conf_res" "$used_file"
  echo " $asset_name — detected license_type=$l_res commercial_use=$c_res confidence=$conf_res file=$used_file"
done
|
|
704
|
+
|
|
705
|
+
# Formats one CSV data row for asset $1 from the existing_* maps (missing
# entries become empty columns).
license_csv_row() {
  local asset="$1"
  echo "${asset},${existing_license_type["$asset"]:-},${existing_commercial_use["$asset"]:-},${existing_confidence["$asset"]:-},${existing_detection_source["$asset"]:-},${existing_license_file["$asset"]:-}"
}

echo "--- writing CSV ---"
mkdir -p "$(dirname "$CSV_FILE")"
echo "asset_name,license_type,commercial_use,confidence,detection_source,license_file" > "$CSV_FILE"

# Release assets first, in release order, each written once.
declare -A out_seen=()
for name in "${release_assets[@]}"; do
  if [[ -n "${out_seen["$name"]:-}" ]]; then
    continue
  fi
  license_csv_row "$name" >> "$CSV_FILE"
  out_seen["$name"]=1
done

# Then every asset that was already in the CSV but is no longer in the release list.
declare -a remaining=()
# Empty assoc: ${!existing_asset_name[@]} can trip `set -u` on some Bash builds.
declare -a existing_asset_keys=()
set +u
existing_asset_keys=("${!existing_asset_name[@]}")
set -u
for name in "${existing_asset_keys[@]}"; do
  if [[ -n "${out_seen["$name"]:-}" ]]; then
    continue
  fi
  remaining+=("$name")
done

if [[ ${#remaining[@]} -gt 0 ]]; then
  echo "Appending ${#remaining[@]} asset(s) present in CSV but not in current release asset list."
  # Sorted so the appended tail does not depend on associative-array key order.
  mapfile -t remaining_sorted < <(printf "%s\n" "${remaining[@]}" | sort)
  for name in "${remaining_sorted[@]}"; do
    license_csv_row "$name" >> "$CSV_FILE"
  done
fi

out_lines="$(wc -l < "$CSV_FILE")"
out_lines="${out_lines// /}"
echo "Done. Wrote $CSV_FILE ($out_lines lines including header)."
|
|
739
|
+
|
|
740
|
+
# Succeeds when $1 and $2 name the same file.
# Existing files are compared by device and inode (-ef), so symlinked
# directories and hard links are recognised. Otherwise the physical parent
# directories plus basenames are compared; if either parent directory cannot
# be entered the paths are reported as different.
same_canonical_path() {
  local a="$1" b="$2"
  local dir_a dir_b

  if [[ "$a" -ef "$b" ]]; then
    return 0
  fi
  dir_a="$(cd "$(dirname "$a")" 2>/dev/null && pwd -P)" || return 1
  dir_b="$(cd "$(dirname "$b")" 2>/dev/null && pwd -P)" || return 1
  [[ "${dir_a}/$(basename "$a")" == "${dir_b}/$(basename "$b")" ]]
}

# Keep Android and iOS bundled copies identical (paths relative to repo root).
# When --csv already points at one of the bundled paths, skip copying onto itself (cp errors on same file).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
if [[ -d "$REPO_ROOT/android" && -d "$REPO_ROOT/ios" && -f "$CSV_FILE" ]]; then
  _bn="$(basename "$CSV_FILE")"
  _android_dir="$REPO_ROOT/android/src/main/assets/model_licenses"
  _ios_dir="$REPO_ROOT/ios/Resources/model_licenses"
  _android_target="$_android_dir/$_bn"
  _ios_target="$_ios_dir/$_bn"
  mkdir -p "$_android_dir" "$_ios_dir"
  if ! same_canonical_path "$CSV_FILE" "$_android_target"; then
    cp "$CSV_FILE" "$_android_target"
  fi
  if ! same_canonical_path "$CSV_FILE" "$_ios_target"; then
    cp "$CSV_FILE" "$_ios_target"
  fi
  echo "Synced $_bn → android/src/main/assets/model_licenses/ and ios/Resources/model_licenses/"
fi
|