only_ever_generator 8.4.6 → 8.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bootstrap/app.d.ts +6 -3
- package/dist/bootstrap/app.d.ts.map +1 -1
- package/dist/bootstrap/app.js +11 -17
- package/dist/bootstrap/app.js.map +1 -1
- package/dist/card_gen/generate_cards.d.ts +2 -3
- package/dist/card_gen/generate_cards.d.ts.map +1 -1
- package/dist/card_gen/generate_cards.js +22 -15
- package/dist/card_gen/generate_cards.js.map +1 -1
- package/dist/constants/prompt_data.d.ts +4 -4
- package/dist/constants/prompt_data.js +302 -302
- package/dist/constants/prompts/card_gen_prompt.js +160 -160
- package/dist/constants/prompts/typology_prompt.js +131 -131
- package/dist/constants/source_data.d.ts +171 -171
- package/dist/constants/source_data.js +973 -973
- package/dist/embedding_generation/local_consolidation.js +104 -104
- package/dist/helper/build_concept_facts_schema.d.ts +42 -42
- package/dist/helper/build_concept_facts_schema.js +44 -44
- package/dist/helper/qdrant_db_methods.d.ts.map +1 -1
- package/dist/helper/schema_helper/build_card_schema.d.ts +1 -9
- package/dist/helper/schema_helper/build_card_schema.d.ts.map +1 -1
- package/dist/helper/schema_helper/build_card_schema.js +47 -50
- package/dist/helper/schema_helper/build_card_schema.js.map +1 -1
- package/dist/helper/schema_helper/build_concept_facts_schema.d.ts +1 -1
- package/dist/helper/schema_helper/build_concept_facts_schema.d.ts.map +1 -1
- package/dist/helper/schema_helper/build_concept_facts_schema.js +20 -5
- package/dist/helper/schema_helper/build_concept_facts_schema.js.map +1 -1
- package/dist/helper/schema_helper/build_summary_schema.d.ts +1 -1
- package/dist/helper/schema_helper/build_summary_schema.d.ts.map +1 -1
- package/dist/helper/schema_helper/build_summary_schema.js +18 -7
- package/dist/helper/schema_helper/build_summary_schema.js.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -5
- package/dist/index.js.map +1 -1
- package/dist/parse/response_format_card.d.ts +176 -176
- package/dist/parse/response_format_card.js +371 -371
- package/dist/parse/response_format_typology.d.ts +1 -1
- package/dist/parse/response_format_typology.js +46 -46
- package/dist/services/get_prompts.d.ts +8 -7
- package/dist/services/get_prompts.d.ts.map +1 -1
- package/dist/services/get_prompts.js +69 -21
- package/dist/services/get_prompts.js.map +1 -1
- package/dist/typology_gen/generate_concept_facts.d.ts +2 -3
- package/dist/typology_gen/generate_concept_facts.d.ts.map +1 -1
- package/dist/typology_gen/generate_concept_facts.js +25 -15
- package/dist/typology_gen/generate_concept_facts.js.map +1 -1
- package/dist/typology_gen/generate_typology.d.ts +2 -1
- package/dist/typology_gen/generate_typology.d.ts.map +1 -1
- package/dist/typology_gen/generate_typology.js +24 -13
- package/dist/typology_gen/generate_typology.js.map +1 -1
- package/dist/typology_gen/summarize.d.ts +2 -3
- package/dist/typology_gen/summarize.d.ts.map +1 -1
- package/dist/typology_gen/summarize.js +24 -13
- package/dist/typology_gen/summarize.js.map +1 -1
- package/package.json +39 -38
- package/src/bootstrap/app.ts +418 -416
- package/src/card_gen/generate_cards.ts +347 -345
- package/src/config.ts +11 -11
- package/src/constants/api_constants.ts +7 -7
- package/src/constants/prompts/card_gen_prompt.ts +164 -164
- package/src/constants/prompts/typology_prompt.ts +139 -139
- package/src/embedding_generation/consolidation/global_consolidation.ts +96 -96
- package/src/embedding_generation/consolidation/local_consolidation.ts +141 -141
- package/src/embedding_generation/consolidation/write_consolidated_data.ts +98 -98
- package/src/embedding_generation/generate_embeddings.ts +42 -42
- package/src/embedding_generation/parse_embedding_response.ts +31 -31
- package/src/enums/card_type_enum.ts +6 -6
- package/src/gap_fill/calculate_gap_fill.ts +50 -50
- package/src/helper/get_id_from_title.ts +33 -33
- package/src/helper/mongo_helper.ts +29 -29
- package/src/helper/openai_helper.ts +20 -20
- package/src/helper/qdrant_db_methods.ts +77 -77
- package/src/helper/schema_helper/build_card_schema.ts +74 -98
- package/src/helper/schema_helper/build_classify_summarize_schema.ts +43 -43
- package/src/helper/schema_helper/build_concept_facts_schema.ts +45 -31
- package/src/helper/schema_helper/build_summary_schema.ts +43 -32
- package/src/index.ts +71 -73
- package/src/logger.ts +65 -65
- package/src/parse/parse_card/parse_cloze_card.ts +146 -146
- package/src/parse/parse_card/parse_flash_cards.ts +42 -42
- package/src/parse/parse_card/parse_match_card.ts +104 -104
- package/src/parse/parse_card/parse_mcq_card.ts +114 -114
- package/src/parse/parse_card_response.ts +197 -197
- package/src/parse/parse_source_content.ts +212 -212
- package/src/services/get_prompts.ts +164 -112
- package/src/services/open_ai_service.ts +89 -89
- package/src/services/qdrant_service.ts +10 -10
- package/src/types/base_param_type.ts +13 -13
- package/src/types/mongo_concept_fact_type.ts +12 -12
- package/src/types/parsed_card_type.ts +39 -39
- package/src/types/raw_card_response_types/generated_card_response_type.ts +59 -59
- package/src/types/source_taxonomy_type.ts +24 -24
- package/src/typology-parsed-response.ts +1932 -1932
- package/src/typology_gen/generate_concept_facts.ts +180 -169
- package/src/typology_gen/generate_typology.ts +203 -189
- package/src/typology_gen/summarize.ts +176 -164
- package/src/utils/distributed_quote_restoration.ts +80 -80
- package/src/utils/generate_args.ts +29 -29
- package/src/utils/parse_openai_response.ts +19 -19
- package/src/utils/sanitize_strings.ts +65 -65
- package/tsconfig.json +16 -16
- package/dist/constants/default_generation_variables.d.ts +0 -3
- package/dist/constants/default_generation_variables.d.ts.map +0 -1
- package/dist/constants/default_generation_variables.js +0 -580
- package/dist/constants/default_generation_variables.js.map +0 -1
- package/dist/services/prompts_test.d.ts +0 -10
- package/dist/services/prompts_test.d.ts.map +0 -1
- package/dist/services/prompts_test.js +0 -227
- package/dist/services/prompts_test.js.map +0 -1
- package/dist/types/generation_variables_schema.d.ts +0 -14
- package/dist/types/generation_variables_schema.d.ts.map +0 -1
- package/dist/types/generation_variables_schema.js +0 -3
- package/dist/types/generation_variables_schema.js.map +0 -1
- package/dist/utils/test.d.ts +0 -2
- package/dist/utils/test.d.ts.map +0 -1
- package/dist/utils/test.js +0 -5
- package/dist/utils/test.js.map +0 -1
- package/src/constants/default_generation_variables.ts +0 -624
- package/src/types/generation_variables_schema.ts +0 -16
|
@@ -1,212 +1,212 @@
|
|
|
1
|
-
import { SourceTaxonomy } from "../types/source_taxonomy_type";
|
|
2
|
-
|
|
3
|
-
export class ParseSourceContent {
|
|
4
|
-
public content: any;
|
|
5
|
-
/// Format of Content
|
|
6
|
-
// content: {
|
|
7
|
-
// title: source.title,
|
|
8
|
-
// headings: source.headings,
|
|
9
|
-
// content: source.content,
|
|
10
|
-
// fields: fields,
|
|
11
|
-
// taxonomy: source.source_taxonomy,
|
|
12
|
-
// type: source.source_type
|
|
13
|
-
// },
|
|
14
|
-
|
|
15
|
-
titles_to_remove = [
|
|
16
|
-
"See also",
|
|
17
|
-
"References",
|
|
18
|
-
"Further reading",
|
|
19
|
-
"External links",
|
|
20
|
-
"Notes and references",
|
|
21
|
-
"Bibliography",
|
|
22
|
-
"Notes",
|
|
23
|
-
"Cited sources",
|
|
24
|
-
];
|
|
25
|
-
block_types_toremove = ["table", "empty_line"];
|
|
26
|
-
constructor(sourceContent: any) {
|
|
27
|
-
this.content = sourceContent;
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
parseData(): {
|
|
31
|
-
source_id: string;
|
|
32
|
-
type: string;
|
|
33
|
-
title: string;
|
|
34
|
-
content: any[];
|
|
35
|
-
headings: string[];
|
|
36
|
-
taxonomy: SourceTaxonomy;
|
|
37
|
-
} {
|
|
38
|
-
let sourceType = this.content.type;
|
|
39
|
-
let afterSanitized;
|
|
40
|
-
if (sourceType == "video") {
|
|
41
|
-
afterSanitized = this.parseVideoContent(this.content.content);
|
|
42
|
-
} else {
|
|
43
|
-
let dataAfterRemovingUnWantedBlocks = this.removeSectionsByTitle(
|
|
44
|
-
this.content.content
|
|
45
|
-
);
|
|
46
|
-
afterSanitized = this.sanitizeBlocks(dataAfterRemovingUnWantedBlocks);
|
|
47
|
-
}
|
|
48
|
-
return {
|
|
49
|
-
source_id: this.content.source_id,
|
|
50
|
-
type: this.content.type,
|
|
51
|
-
title: this.content.title,
|
|
52
|
-
content: afterSanitized,
|
|
53
|
-
headings: this.content.headings,
|
|
54
|
-
taxonomy: this.content.taxonomy,
|
|
55
|
-
};
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
removeSectionsByTitle(data: Array<any>) {
|
|
59
|
-
let dataAfterRemoving = [];
|
|
60
|
-
for (let elem of data) {
|
|
61
|
-
if (
|
|
62
|
-
elem.block_type == "heading" &&
|
|
63
|
-
this.titles_to_remove.includes(elem.content)
|
|
64
|
-
) {
|
|
65
|
-
continue;
|
|
66
|
-
}
|
|
67
|
-
/// remove unwanted blcok types , for now `table` and `empty_line`
|
|
68
|
-
if (this.block_types_toremove.includes(elem.block_type)) {
|
|
69
|
-
continue;
|
|
70
|
-
}
|
|
71
|
-
if (elem.children) {
|
|
72
|
-
elem.children = this.removeSectionsByTitle(elem.children);
|
|
73
|
-
}
|
|
74
|
-
dataAfterRemoving.push(elem);
|
|
75
|
-
}
|
|
76
|
-
return dataAfterRemoving;
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
sanitizeTextContent(content: String) {
|
|
80
|
-
// Remove newline characters
|
|
81
|
-
content = content.replace(/\\n/g, " ");
|
|
82
|
-
|
|
83
|
-
// Remove internal link references, keeping only the link text
|
|
84
|
-
// Pattern explanation: [[link|text|index|wiki]] --> text
|
|
85
|
-
content = content.replace(/\[\[.*?\|(.*?)\|.*?\|wiki\]\]/g, "$1");
|
|
86
|
-
|
|
87
|
-
// Remove external links, keeping only the link text
|
|
88
|
-
// Pattern explanation: [url text] --> text
|
|
89
|
-
content = content.replace(/\[http[s]?:\/\/[^\s]+ ([^\]]+)\]/g, "$1");
|
|
90
|
-
|
|
91
|
-
// Remove Markdown link references, keeping only the link text
|
|
92
|
-
// Pattern explanation:  --> link text
|
|
93
|
-
content = content.replace(/\!\[([^\]]+)\]\([^\)]+\)/g, "$1");
|
|
94
|
-
|
|
95
|
-
return content;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
sanitizeBlocks(blocks: Array<any>) {
|
|
99
|
-
let sanitizedBlocks = <any>[];
|
|
100
|
-
blocks = blocks.filter((item) => item.block_type != "table");
|
|
101
|
-
blocks.forEach((block) => {
|
|
102
|
-
let sanitizedBlock: any = {};
|
|
103
|
-
for (let key in block) {
|
|
104
|
-
let value = block[key];
|
|
105
|
-
if (typeof value === "string") {
|
|
106
|
-
sanitizedBlock[key] = this.sanitizeTextContent(value);
|
|
107
|
-
} else if (Array.isArray(value)) {
|
|
108
|
-
sanitizedBlock[key] = this.sanitizeBlocks(value);
|
|
109
|
-
} else {
|
|
110
|
-
sanitizedBlock[key] = value;
|
|
111
|
-
}
|
|
112
|
-
}
|
|
113
|
-
sanitizedBlocks.push(sanitizedBlock);
|
|
114
|
-
});
|
|
115
|
-
return sanitizedBlocks;
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
parseVideoContent(data: Array<any>) {
|
|
119
|
-
let finalChapters: Array<any> = [];
|
|
120
|
-
// let cleanedData = this.cleanTranscript(timeCodes);
|
|
121
|
-
data.forEach((e) => {
|
|
122
|
-
let combinedContent = this.cleanTranscript(e);
|
|
123
|
-
finalChapters.push({
|
|
124
|
-
startTime: e.startTime,
|
|
125
|
-
endTime: e.endTime,
|
|
126
|
-
content: combinedContent,
|
|
127
|
-
title: e.content,
|
|
128
|
-
});
|
|
129
|
-
});
|
|
130
|
-
|
|
131
|
-
return finalChapters;
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
// remove content inside [] which denotes non-speech sounds
|
|
135
|
-
isNonSpeech(content: string) {
|
|
136
|
-
// Check if the content is non-speech (enclosed in square brackets).
|
|
137
|
-
return /^\[.*\]$/.test(content.trim());
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
// remove non-essential content
|
|
141
|
-
cleanTranscript(data: any) {
|
|
142
|
-
let finalContent = "";
|
|
143
|
-
let children = data.children ?? [];
|
|
144
|
-
|
|
145
|
-
children.forEach((e: any) => {
|
|
146
|
-
let content = (e.content || "").trim();
|
|
147
|
-
|
|
148
|
-
if (this.isNonSpeech(content)) return;
|
|
149
|
-
|
|
150
|
-
content = content.replace(/\s+/g, " ");
|
|
151
|
-
finalContent += content;
|
|
152
|
-
});
|
|
153
|
-
|
|
154
|
-
return finalContent;
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
// collapse the timecode to 30 seconds
|
|
158
|
-
collapseTimeCodes(data: Array<any>, maxDuration = 30.0) {
|
|
159
|
-
// Collapse time codes into buckets of approximately maxDuration seconds.
|
|
160
|
-
const collapsedData = [];
|
|
161
|
-
let bucketStartTime: number | null = null;
|
|
162
|
-
let bucketEndTime: number | null = null;
|
|
163
|
-
let bucketContent: Array<any> = [];
|
|
164
|
-
let bucketDuration = 0.0;
|
|
165
|
-
|
|
166
|
-
data.forEach((entry) => {
|
|
167
|
-
const startTime = entry.start_time;
|
|
168
|
-
const endTime = entry.end_time;
|
|
169
|
-
const content = entry.content;
|
|
170
|
-
const entryDuration = endTime - startTime;
|
|
171
|
-
|
|
172
|
-
if (bucketStartTime === null) {
|
|
173
|
-
// Start a new bucket
|
|
174
|
-
bucketStartTime = startTime;
|
|
175
|
-
bucketEndTime = endTime;
|
|
176
|
-
bucketContent.push(content);
|
|
177
|
-
bucketDuration = entryDuration;
|
|
178
|
-
} else if (bucketDuration + entryDuration <= maxDuration) {
|
|
179
|
-
// Add to current bucket
|
|
180
|
-
bucketEndTime = endTime;
|
|
181
|
-
bucketContent.push(content);
|
|
182
|
-
bucketDuration += entryDuration;
|
|
183
|
-
} else {
|
|
184
|
-
// Close current bucket and start a new one
|
|
185
|
-
const collapsedEntry = {
|
|
186
|
-
start_time: bucketStartTime,
|
|
187
|
-
end_time: bucketEndTime,
|
|
188
|
-
content: bucketContent.join(" "),
|
|
189
|
-
};
|
|
190
|
-
collapsedData.push(collapsedEntry);
|
|
191
|
-
|
|
192
|
-
// Start new bucket with current entry
|
|
193
|
-
bucketStartTime = startTime;
|
|
194
|
-
bucketEndTime = endTime;
|
|
195
|
-
bucketContent = [content];
|
|
196
|
-
bucketDuration = entryDuration;
|
|
197
|
-
}
|
|
198
|
-
});
|
|
199
|
-
|
|
200
|
-
// Add the last bucket if it exists
|
|
201
|
-
if (bucketContent.length > 0) {
|
|
202
|
-
const collapsedEntry = {
|
|
203
|
-
start_time: bucketStartTime,
|
|
204
|
-
end_time: bucketEndTime,
|
|
205
|
-
content: bucketContent.join(" "),
|
|
206
|
-
};
|
|
207
|
-
collapsedData.push(collapsedEntry);
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
return collapsedData;
|
|
211
|
-
}
|
|
212
|
-
}
|
|
1
|
+
import { SourceTaxonomy } from "../types/source_taxonomy_type";
|
|
2
|
+
|
|
3
|
+
export class ParseSourceContent {
|
|
4
|
+
public content: any;
|
|
5
|
+
/// Format of Content
|
|
6
|
+
// content: {
|
|
7
|
+
// title: source.title,
|
|
8
|
+
// headings: source.headings,
|
|
9
|
+
// content: source.content,
|
|
10
|
+
// fields: fields,
|
|
11
|
+
// taxonomy: source.source_taxonomy,
|
|
12
|
+
// type: source.source_type
|
|
13
|
+
// },
|
|
14
|
+
|
|
15
|
+
titles_to_remove = [
|
|
16
|
+
"See also",
|
|
17
|
+
"References",
|
|
18
|
+
"Further reading",
|
|
19
|
+
"External links",
|
|
20
|
+
"Notes and references",
|
|
21
|
+
"Bibliography",
|
|
22
|
+
"Notes",
|
|
23
|
+
"Cited sources",
|
|
24
|
+
];
|
|
25
|
+
block_types_toremove = ["table", "empty_line"];
|
|
26
|
+
constructor(sourceContent: any) {
|
|
27
|
+
this.content = sourceContent;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
parseData(): {
|
|
31
|
+
source_id: string;
|
|
32
|
+
type: string;
|
|
33
|
+
title: string;
|
|
34
|
+
content: any[];
|
|
35
|
+
headings: string[];
|
|
36
|
+
taxonomy: SourceTaxonomy;
|
|
37
|
+
} {
|
|
38
|
+
let sourceType = this.content.type;
|
|
39
|
+
let afterSanitized;
|
|
40
|
+
if (sourceType == "video") {
|
|
41
|
+
afterSanitized = this.parseVideoContent(this.content.content);
|
|
42
|
+
} else {
|
|
43
|
+
let dataAfterRemovingUnWantedBlocks = this.removeSectionsByTitle(
|
|
44
|
+
this.content.content
|
|
45
|
+
);
|
|
46
|
+
afterSanitized = this.sanitizeBlocks(dataAfterRemovingUnWantedBlocks);
|
|
47
|
+
}
|
|
48
|
+
return {
|
|
49
|
+
source_id: this.content.source_id,
|
|
50
|
+
type: this.content.type,
|
|
51
|
+
title: this.content.title,
|
|
52
|
+
content: afterSanitized,
|
|
53
|
+
headings: this.content.headings,
|
|
54
|
+
taxonomy: this.content.taxonomy,
|
|
55
|
+
};
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
removeSectionsByTitle(data: Array<any>) {
|
|
59
|
+
let dataAfterRemoving = [];
|
|
60
|
+
for (let elem of data) {
|
|
61
|
+
if (
|
|
62
|
+
elem.block_type == "heading" &&
|
|
63
|
+
this.titles_to_remove.includes(elem.content)
|
|
64
|
+
) {
|
|
65
|
+
continue;
|
|
66
|
+
}
|
|
67
|
+
/// remove unwanted blcok types , for now `table` and `empty_line`
|
|
68
|
+
if (this.block_types_toremove.includes(elem.block_type)) {
|
|
69
|
+
continue;
|
|
70
|
+
}
|
|
71
|
+
if (elem.children) {
|
|
72
|
+
elem.children = this.removeSectionsByTitle(elem.children);
|
|
73
|
+
}
|
|
74
|
+
dataAfterRemoving.push(elem);
|
|
75
|
+
}
|
|
76
|
+
return dataAfterRemoving;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
sanitizeTextContent(content: String) {
|
|
80
|
+
// Remove newline characters
|
|
81
|
+
content = content.replace(/\\n/g, " ");
|
|
82
|
+
|
|
83
|
+
// Remove internal link references, keeping only the link text
|
|
84
|
+
// Pattern explanation: [[link|text|index|wiki]] --> text
|
|
85
|
+
content = content.replace(/\[\[.*?\|(.*?)\|.*?\|wiki\]\]/g, "$1");
|
|
86
|
+
|
|
87
|
+
// Remove external links, keeping only the link text
|
|
88
|
+
// Pattern explanation: [url text] --> text
|
|
89
|
+
content = content.replace(/\[http[s]?:\/\/[^\s]+ ([^\]]+)\]/g, "$1");
|
|
90
|
+
|
|
91
|
+
// Remove Markdown link references, keeping only the link text
|
|
92
|
+
// Pattern explanation:  --> link text
|
|
93
|
+
content = content.replace(/\!\[([^\]]+)\]\([^\)]+\)/g, "$1");
|
|
94
|
+
|
|
95
|
+
return content;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
sanitizeBlocks(blocks: Array<any>) {
|
|
99
|
+
let sanitizedBlocks = <any>[];
|
|
100
|
+
blocks = blocks.filter((item) => item.block_type != "table");
|
|
101
|
+
blocks.forEach((block) => {
|
|
102
|
+
let sanitizedBlock: any = {};
|
|
103
|
+
for (let key in block) {
|
|
104
|
+
let value = block[key];
|
|
105
|
+
if (typeof value === "string") {
|
|
106
|
+
sanitizedBlock[key] = this.sanitizeTextContent(value);
|
|
107
|
+
} else if (Array.isArray(value)) {
|
|
108
|
+
sanitizedBlock[key] = this.sanitizeBlocks(value);
|
|
109
|
+
} else {
|
|
110
|
+
sanitizedBlock[key] = value;
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
sanitizedBlocks.push(sanitizedBlock);
|
|
114
|
+
});
|
|
115
|
+
return sanitizedBlocks;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
parseVideoContent(data: Array<any>) {
|
|
119
|
+
let finalChapters: Array<any> = [];
|
|
120
|
+
// let cleanedData = this.cleanTranscript(timeCodes);
|
|
121
|
+
data.forEach((e) => {
|
|
122
|
+
let combinedContent = this.cleanTranscript(e);
|
|
123
|
+
finalChapters.push({
|
|
124
|
+
startTime: e.startTime,
|
|
125
|
+
endTime: e.endTime,
|
|
126
|
+
content: combinedContent,
|
|
127
|
+
title: e.content,
|
|
128
|
+
});
|
|
129
|
+
});
|
|
130
|
+
|
|
131
|
+
return finalChapters;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
// remove content inside [] which denotes non-speech sounds
|
|
135
|
+
isNonSpeech(content: string) {
|
|
136
|
+
// Check if the content is non-speech (enclosed in square brackets).
|
|
137
|
+
return /^\[.*\]$/.test(content.trim());
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// remove non-essential content
|
|
141
|
+
cleanTranscript(data: any) {
|
|
142
|
+
let finalContent = "";
|
|
143
|
+
let children = data.children ?? [];
|
|
144
|
+
|
|
145
|
+
children.forEach((e: any) => {
|
|
146
|
+
let content = (e.content || "").trim();
|
|
147
|
+
|
|
148
|
+
if (this.isNonSpeech(content)) return;
|
|
149
|
+
|
|
150
|
+
content = content.replace(/\s+/g, " ");
|
|
151
|
+
finalContent += content;
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
return finalContent;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// collapse the timecode to 30 seconds
|
|
158
|
+
collapseTimeCodes(data: Array<any>, maxDuration = 30.0) {
|
|
159
|
+
// Collapse time codes into buckets of approximately maxDuration seconds.
|
|
160
|
+
const collapsedData = [];
|
|
161
|
+
let bucketStartTime: number | null = null;
|
|
162
|
+
let bucketEndTime: number | null = null;
|
|
163
|
+
let bucketContent: Array<any> = [];
|
|
164
|
+
let bucketDuration = 0.0;
|
|
165
|
+
|
|
166
|
+
data.forEach((entry) => {
|
|
167
|
+
const startTime = entry.start_time;
|
|
168
|
+
const endTime = entry.end_time;
|
|
169
|
+
const content = entry.content;
|
|
170
|
+
const entryDuration = endTime - startTime;
|
|
171
|
+
|
|
172
|
+
if (bucketStartTime === null) {
|
|
173
|
+
// Start a new bucket
|
|
174
|
+
bucketStartTime = startTime;
|
|
175
|
+
bucketEndTime = endTime;
|
|
176
|
+
bucketContent.push(content);
|
|
177
|
+
bucketDuration = entryDuration;
|
|
178
|
+
} else if (bucketDuration + entryDuration <= maxDuration) {
|
|
179
|
+
// Add to current bucket
|
|
180
|
+
bucketEndTime = endTime;
|
|
181
|
+
bucketContent.push(content);
|
|
182
|
+
bucketDuration += entryDuration;
|
|
183
|
+
} else {
|
|
184
|
+
// Close current bucket and start a new one
|
|
185
|
+
const collapsedEntry = {
|
|
186
|
+
start_time: bucketStartTime,
|
|
187
|
+
end_time: bucketEndTime,
|
|
188
|
+
content: bucketContent.join(" "),
|
|
189
|
+
};
|
|
190
|
+
collapsedData.push(collapsedEntry);
|
|
191
|
+
|
|
192
|
+
// Start new bucket with current entry
|
|
193
|
+
bucketStartTime = startTime;
|
|
194
|
+
bucketEndTime = endTime;
|
|
195
|
+
bucketContent = [content];
|
|
196
|
+
bucketDuration = entryDuration;
|
|
197
|
+
}
|
|
198
|
+
});
|
|
199
|
+
|
|
200
|
+
// Add the last bucket if it exists
|
|
201
|
+
if (bucketContent.length > 0) {
|
|
202
|
+
const collapsedEntry = {
|
|
203
|
+
start_time: bucketStartTime,
|
|
204
|
+
end_time: bucketEndTime,
|
|
205
|
+
content: bucketContent.join(" "),
|
|
206
|
+
};
|
|
207
|
+
collapsedData.push(collapsedEntry);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
return collapsedData;
|
|
211
|
+
}
|
|
212
|
+
}
|