label-studio-converter 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +33 -0
- package/README.md +351 -0
- package/dist/bash-complete.cjs +1296 -0
- package/dist/bash-complete.cjs.map +1 -0
- package/dist/bash-complete.d.cts +1 -0
- package/dist/bash-complete.d.ts +1 -0
- package/dist/bash-complete.js +1279 -0
- package/dist/bash-complete.js.map +1 -0
- package/dist/cli.cjs +1281 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +1264 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +418 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +309 -0
- package/dist/index.d.ts +309 -0
- package/dist/index.js +377 -0
- package/dist/index.js.map +1 -0
- package/package.json +78 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,1264 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Bundler runtime helpers (emitted by the build tool, used by every module below).
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;

/**
 * Wraps a single-entry module record `{ "path"() { ... } }` into a lazy,
 * run-once initializer.
 *
 * The first call runs the record's first own function (with `0` as its only
 * argument) and caches the return value; every later call returns the cached
 * value without running the module body again.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const moduleBody = fn[__getOwnPropNames(fn)[0]];
    // Clear the record *before* running the body so re-entrant or repeated
    // calls never execute it a second time.
    fn = 0;
    res = moduleBody(0);
  }
  return res;
};

/**
 * Defines every key of `all` on `target` as an enumerable getter, so the
 * exported value is looked up each time it is read.
 */
var __export = (target, all) => {
  for (const name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};
|
|
11
|
+
|
|
12
|
+
// node_modules/.pnpm/tsup@8.5.1_jiti@2.4.2_postcss@8.5.6_typescript@5.9.3_yaml@2.8.2/node_modules/tsup/assets/esm_shims.js
|
|
13
|
+
import path from "path";
|
|
14
|
+
import { fileURLToPath } from "url";
|
|
15
|
+
// Lazy initializer for the bundled tsup ESM shim module. The module body is
// empty, so calling it does no work; other modules call it first to keep the
// original import order.
var init_esm_shims = __esm({
  "node_modules/.pnpm/tsup@8.5.1_jiti@2.4.2_postcss@8.5.6_typescript@5.9.3_yaml@2.8.2/node_modules/tsup/assets/esm_shims.js"() {
    "use strict";
  }
});
|
|
20
|
+
|
|
21
|
+
// src/constants.ts
|
|
22
|
+
var OUTPUT_BASE_DIR,
  DEFAULT_LABEL_NAME,
  DEFAULT_LABEL_STUDIO_FULL_JSON,
  DEFAULT_CREATE_FILE_PER_IMAGE,
  DEFAULT_CREATE_FILE_LIST_FOR_SERVING,
  DEFAULT_FILE_LIST_NAME,
  DEFAULT_BASE_SERVER_URL,
  DEFAULT_PPOCR_FILE_NAME,
  SORT_VERTICAL_NONE,
  SORT_VERTICAL_TOP_BOTTOM,
  SORT_VERTICAL_BOTTOM_TOP,
  DEFAULT_SORT_VERTICAL,
  SORT_HORIZONTAL_NONE,
  SORT_HORIZONTAL_LTR,
  SORT_HORIZONTAL_RTL,
  DEFAULT_SORT_HORIZONTAL;

// Lazy initializer for src/constants.ts: assigns the shared default values
// used by the conversion commands.
var init_constants = __esm({
  "src/constants.ts"() {
    "use strict";
    init_esm_shims();

    // Output locations and file names.
    OUTPUT_BASE_DIR = "./output";
    DEFAULT_FILE_LIST_NAME = "files.txt";
    DEFAULT_PPOCR_FILE_NAME = "Label.txt";
    DEFAULT_BASE_SERVER_URL = "http://localhost:8081";

    // Conversion defaults.
    DEFAULT_LABEL_NAME = "Text";
    DEFAULT_LABEL_STUDIO_FULL_JSON = true;
    DEFAULT_CREATE_FILE_PER_IMAGE = false;
    DEFAULT_CREATE_FILE_LIST_FOR_SERVING = true;

    // Vertical sort modes; sorting is off by default.
    [SORT_VERTICAL_NONE, SORT_VERTICAL_TOP_BOTTOM, SORT_VERTICAL_BOTTOM_TOP] = [
      "none",
      "top-bottom",
      "bottom-top"
    ];
    DEFAULT_SORT_VERTICAL = SORT_VERTICAL_NONE;

    // Horizontal sort modes; sorting is off by default.
    [SORT_HORIZONTAL_NONE, SORT_HORIZONTAL_LTR, SORT_HORIZONTAL_RTL] = [
      "none",
      "ltr",
      "rtl"
    ];
    DEFAULT_SORT_HORIZONTAL = SORT_HORIZONTAL_NONE;
  }
});
|
|
45
|
+
|
|
46
|
+
// src/lib/ppocr-label.ts
|
|
47
|
+
import { randomUUID } from "crypto";
|
|
48
|
+
import { existsSync, readFileSync } from "fs";
|
|
49
|
+
import { join } from "path";
|
|
50
|
+
import sizeOf from "image-size";
|
|
51
|
+
var ppocrToLabelStudio, ppocrToFullLabelStudio, ppocrToMinLabelStudio;

// Lazy initializer for src/lib/ppocr-label.ts: converters from PPOCR
// annotations (`{ transcription, points }` items) to Label Studio JSON.
var init_ppocr_label = __esm({
  "src/lib/ppocr-label.ts"() {
    "use strict";
    init_esm_shims();
    init_constants();

    /**
     * Normalizes a base URL so it ends with exactly one "/".
     * An empty or missing base URL stays "" so image paths remain relative
     * (previously a missing value crashed on `.replace`).
     */
    const withTrailingSlash = (baseServerUrl) => {
      const base = baseServerUrl ?? "";
      return base === "" ? "" : `${base.replace(/\/+$/, "")}/`;
    };

    /**
     * Reads the pixel dimensions of the image on disk.
     *
     * @param {string} imagePath - Image path as written in the label file.
     * @param {string} [inputDir] - Directory the image path is joined onto.
     * @returns {{ original_width: number, original_height: number }}
     * @throws {Error} When the file does not exist or no width/height is reported.
     */
    const readImageSize = (imagePath, inputDir) => {
      const resolvedImagePath = inputDir ? join(inputDir, imagePath) : imagePath;
      if (!existsSync(resolvedImagePath)) {
        throw new Error(`Image file not found: ${resolvedImagePath}`);
      }
      const dimensions = sizeOf(readFileSync(resolvedImagePath));
      if (!dimensions.width || !dimensions.height) {
        throw new Error(
          `Failed to read image dimensions from: ${resolvedImagePath}`
        );
      }
      return {
        original_width: dimensions.width,
        original_height: dimensions.height
      };
    };

    /**
     * Converts the PPOCR annotations of one image to Label Studio JSON.
     *
     * @param {Array} data - PPOCR annotation items for the image.
     * @param {object} [options] - `imagePath`, `baseServerUrl`, `inputDir`,
     *   `toFullJson` (default true), `taskId` (default 1) and `labelName`
     *   (default DEFAULT_LABEL_NAME).
     * @returns {Promise<Array>} Full-format task array when `toFullJson` is
     *   truthy, otherwise the min-format array.
     */
    ppocrToLabelStudio = async (data, options) => {
      const {
        imagePath,
        baseServerUrl,
        inputDir,
        toFullJson = true,
        taskId = 1,
        labelName = DEFAULT_LABEL_NAME
      } = options || {};
      if (toFullJson) {
        return ppocrToFullLabelStudio(
          data,
          imagePath,
          baseServerUrl,
          inputDir,
          taskId,
          labelName
        );
      }
      return ppocrToMinLabelStudio(
        data,
        imagePath,
        baseServerUrl,
        inputDir,
        labelName
      );
    };

    /**
     * Builds a one-element array holding a full Label Studio task for the image.
     * Each PPOCR item becomes three result entries sharing one id: the polygon,
     * its label, and its transcription. Points are converted from pixels to
     * percentages of the image size.
     *
     * @throws {Error} When the image cannot be found or measured.
     */
    ppocrToFullLabelStudio = (data, imagePath, baseServerUrl, inputDir, taskId = 1, labelName = DEFAULT_LABEL_NAME) => {
      const newBaseServerUrl = withTrailingSlash(baseServerUrl);
      const now = (/* @__PURE__ */ new Date()).toISOString();
      const { original_width, original_height } = readImageSize(imagePath, inputDir);
      const fileName = imagePath.split("/").pop() || imagePath;

      const toResultEntries = (item) => {
        // The three entries of one region are linked by sharing this id.
        const annotationId = randomUUID().slice(0, 10);
        const polygonPoints = item.points.map(([x, y]) => [
          (x ?? 0) / original_width * 100,
          (y ?? 0) / original_height * 100
        ]);
        const entry = (from_name, type, extraValue) => ({
          original_width,
          original_height,
          image_rotation: 0,
          value: {
            points: polygonPoints,
            closed: true,
            ...extraValue
          },
          id: annotationId,
          from_name,
          to_name: "image",
          type,
          origin: "manual"
        });
        return [
          // 1. Polygon geometry only
          entry("poly", "polygon", {}),
          // 2. Labels with polygon geometry
          entry("label", "labels", { labels: [labelName] }),
          // 3. Textarea with polygon geometry and text
          entry("transcription", "textarea", { text: [item.transcription] })
        ];
      };

      return [
        {
          id: taskId,
          annotations: [
            {
              id: taskId,
              completed_by: 1,
              result: data.flatMap(toResultEntries),
              was_cancelled: false,
              ground_truth: false,
              created_at: now,
              updated_at: now,
              draft_created_at: now,
              lead_time: 0,
              prediction: {},
              result_count: data.length * 3,
              unique_id: randomUUID(),
              import_id: null,
              last_action: null,
              bulk_created: false,
              task: taskId,
              project: 1,
              updated_by: 1,
              parent_prediction: null,
              parent_annotation: null,
              last_created_by: null
            }
          ],
          file_upload: fileName,
          drafts: [],
          predictions: [],
          // Encoded like the min format and the serving file list, so paths
          // with spaces or "%" give the same URL everywhere.
          data: { ocr: encodeURI(`${newBaseServerUrl}${imagePath}`) },
          meta: {},
          created_at: now,
          updated_at: now,
          allow_skip: false,
          inner_id: taskId,
          total_annotations: 1,
          cancelled_annotations: 0,
          total_predictions: 0,
          comment_count: 0,
          unresolved_comment_count: 0,
          last_comment_updated_at: null,
          project: 1,
          updated_by: 1,
          comment_authors: []
        }
      ];
    };

    /**
     * Builds the min-format array: one entry per PPOCR item, holding the
     * axis-aligned bounding box of its points plus the original polygon.
     * Coordinates are passed through as given (no percentage conversion).
     *
     * NOTE(review): the default `labelName` here is "text" while the other
     * converters default to DEFAULT_LABEL_NAME ("Text"); kept for backward
     * compatibility - confirm whether that difference is intended.
     *
     * @throws {Error} When the image cannot be found or measured.
     */
    ppocrToMinLabelStudio = (data, imagePath, baseServerUrl, inputDir, labelName = "text") => {
      const newBaseServerUrl = withTrailingSlash(baseServerUrl);
      const now = (/* @__PURE__ */ new Date()).toISOString();
      const { original_width, original_height } = readImageSize(imagePath, inputDir);
      const ocr = encodeURI(`${newBaseServerUrl}${imagePath}`);

      return data.map((item, index) => {
        const { points } = item;
        let minX = Infinity;
        let minY = Infinity;
        let maxX = -Infinity;
        let maxY = -Infinity;
        for (const [x, y] of points) {
          // Points missing a coordinate do not contribute to the box.
          if (x !== void 0 && y !== void 0) {
            minX = Math.min(minX, x);
            minY = Math.min(minY, y);
            maxX = Math.max(maxX, x);
            maxY = Math.max(maxY, y);
          }
        }
        return {
          ocr,
          id: index + 1,
          bbox: [
            {
              x: minX,
              y: minY,
              width: maxX - minX,
              height: maxY - minY,
              rotation: 0,
              original_width,
              original_height
            }
          ],
          label: [
            {
              points,
              closed: true,
              labels: [labelName],
              original_width,
              original_height
            }
          ],
          transcription: [item.transcription],
          poly: [
            {
              points,
              closed: true,
              original_width,
              original_height
            }
          ],
          annotator: 1,
          annotation_id: index + 1,
          created_at: now,
          updated_at: now,
          lead_time: 0
        };
      });
    };
  }
});
|
|
287
|
+
|
|
288
|
+
// src/lib/schema.ts
|
|
289
|
+
import z from "zod";
|
|
290
|
+
var FullOCRLabelStudioSchema, MinOCRLabelStudioSchema, PPOCRLabelSchema;

// Lazy initializer for src/lib/schema.ts: zod schemas for the full and min
// Label Studio formats and for PPOCR label items.
var init_schema = __esm({
  "src/lib/schema.ts"() {
    "use strict";
    init_esm_shims();

    // Small factories for the field groups that recur across the schemas.
    // Each call returns fresh zod instances.
    const stringList = () => z.array(z.string());
    const imageSize = () => ({
      original_width: z.number(),
      original_height: z.number()
    });
    const rectangleShape = () => ({
      x: z.number(),
      y: z.number(),
      width: z.number(),
      height: z.number(),
      rotation: z.number()
    });
    const polygonShape = () => ({
      points: z.array(z.array(z.number())),
      closed: z.boolean()
    });

    // One entry of an annotation/draft `result` array; only `value` varies.
    const resultEntry = (valueShape) => z.object({
      ...imageSize(),
      image_rotation: z.number(),
      value: z.object(valueShape),
      id: z.string(),
      from_name: z.string(),
      to_name: z.string(),
      type: z.string(),
      origin: z.string()
    });

    // Accepted `result` entries: rectangle or polygon geometry, each either
    // bare, with labels, or with text. Union order is significant and kept.
    const resultEntries = () => z.array(
      z.union([
        resultEntry(rectangleShape()),
        resultEntry({ ...rectangleShape(), labels: stringList() }),
        resultEntry({ ...rectangleShape(), text: stringList() }),
        resultEntry(polygonShape()),
        resultEntry({ ...polygonShape(), labels: stringList() }),
        resultEntry({ ...polygonShape(), text: stringList() })
      ])
    );

    const annotationSchema = z.object({
      id: z.number(),
      completed_by: z.number(),
      result: resultEntries(),
      was_cancelled: z.boolean(),
      ground_truth: z.boolean(),
      created_at: z.string(),
      updated_at: z.string(),
      draft_created_at: z.string(),
      lead_time: z.number(),
      prediction: z.object({}),
      result_count: z.number(),
      unique_id: z.string(),
      import_id: z.null(),
      last_action: z.null(),
      bulk_created: z.boolean(),
      task: z.number(),
      project: z.number(),
      updated_by: z.number(),
      parent_prediction: z.null(),
      parent_annotation: z.null(),
      last_created_by: z.null()
    });

    const draftSchema = z.object({
      id: z.number(),
      user: z.string(),
      created_username: z.string(),
      created_ago: z.string(),
      result: resultEntries(),
      lead_time: z.number(),
      was_postponed: z.boolean(),
      import_id: z.null(),
      created_at: z.string(),
      updated_at: z.string(),
      task: z.number(),
      annotation: z.number()
    });

    // Full Label Studio export: an array of tasks.
    FullOCRLabelStudioSchema = z.array(
      z.object({
        id: z.number(),
        annotations: z.array(annotationSchema),
        file_upload: z.string(),
        drafts: z.array(draftSchema),
        predictions: z.array(z.unknown()),
        data: z.object({ ocr: z.string() }),
        meta: z.object({}),
        created_at: z.string(),
        updated_at: z.string(),
        allow_skip: z.boolean(),
        inner_id: z.number(),
        total_annotations: z.number(),
        cancelled_annotations: z.number(),
        total_predictions: z.number(),
        comment_count: z.number(),
        unresolved_comment_count: z.number(),
        last_comment_updated_at: z.null(),
        project: z.number(),
        updated_by: z.number(),
        comment_authors: z.array(z.unknown())
      })
    );

    // Min Label Studio export: an array of flat per-region records.
    MinOCRLabelStudioSchema = z.array(
      z.object({
        ocr: z.string(),
        id: z.number(),
        bbox: z.array(z.object({ ...rectangleShape(), ...imageSize() })),
        label: z.array(
          z.union([
            z.object({
              ...rectangleShape(),
              labels: stringList(),
              ...imageSize()
            }),
            z.object({
              ...polygonShape(),
              labels: stringList(),
              ...imageSize()
            })
          ])
        ),
        transcription: stringList(),
        poly: z.array(z.object({ ...polygonShape(), ...imageSize() })),
        annotator: z.number(),
        annotation_id: z.number(),
        created_at: z.string(),
        updated_at: z.string(),
        lead_time: z.number()
      })
    );

    // PPOCR label items, one array per image line.
    PPOCRLabelSchema = z.array(
      z.object({
        transcription: z.string(),
        points: z.array(z.array(z.number())),
        dt_score: z.number()
      })
    );
  }
});
|
|
619
|
+
|
|
620
|
+
// src/lib/sort.ts
|
|
621
|
+
/**
 * Computes the axis-aligned bounding box of a list of `[x, y]` points.
 *
 * Points missing either coordinate are ignored. When no usable point exists
 * (for example an empty list) a zero-sized box at the origin is returned;
 * otherwise the result would be NaN / -Infinity, which breaks the sort
 * comparators that consume it.
 *
 * @param {Array<Array<number>>} points - Polygon vertices as `[x, y]` pairs.
 * @returns {{ x: number, y: number, width: number, height: number }} The box
 *   centre (`x`, `y`) and its `width` / `height`.
 */
function getBoundingBoxCenter(points) {
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  let hasPoint = false;
  for (const [x, y] of points) {
    if (x !== void 0 && y !== void 0) {
      hasPoint = true;
      minX = Math.min(minX, x);
      minY = Math.min(minY, y);
      maxX = Math.max(maxX, x);
      maxY = Math.max(maxY, y);
    }
  }
  if (!hasPoint) {
    return { x: 0, y: 0, width: 0, height: 0 };
  }
  return {
    x: (minX + maxX) / 2,
    y: (minY + maxY) / 2,
    width: maxX - minX,
    height: maxY - minY
  };
}
|
|
641
|
+
/**
 * Returns the annotations ordered by the position of their bounding boxes.
 *
 * - Both sort modes "none": the input array itself is returned, untouched.
 * - RTL + a vertical mode + mostly tall boxes (height > 1.5 x width for more
 *   than half of them): boxes are grouped into columns by centre x, columns
 *   are ordered right-to-left, and each column is ordered vertically.
 * - Otherwise: ordered by centre y first (when a vertical mode is set and the
 *   difference exceeds GROUPING_TOLERANCE), then by centre x.
 *
 * Each bounding box is computed once up front instead of on every comparator
 * call, and column averages use running sums; the resulting order is
 * unchanged.
 *
 * NOTE(review): the general comparator treats boxes within the tolerance as
 * the same row, so it may not be transitive for chains of nearby boxes -
 * confirm whether explicit row grouping is wanted.
 *
 * @param {Array<{ points: Array<Array<number>> }>} annotations
 * @param {string} verticalSort - One of the SORT_VERTICAL_* values.
 * @param {string} horizontalSort - One of the SORT_HORIZONTAL_* values.
 * @returns {Array} The input array (no sorting requested) or a new sorted array.
 */
function sortBoundingBoxes(annotations, verticalSort, horizontalSort) {
  if (verticalSort === SORT_VERTICAL_NONE && horizontalSort === SORT_HORIZONTAL_NONE) {
    return annotations;
  }

  const entries = annotations.map((annotation) => ({
    annotation,
    center: getBoundingBoxCenter(annotation.points)
  }));

  const tallCount = entries.filter(
    ({ center }) => center.height > center.width * 1.5
  ).length;
  const isVerticalText = entries.length > 0 && tallCount > entries.length / 2;

  const verticalDiff = (a, b) =>
    verticalSort === SORT_VERTICAL_TOP_BOTTOM
      ? a.center.y - b.center.y
      : b.center.y - a.center.y;

  if (horizontalSort === SORT_HORIZONTAL_RTL && verticalSort !== SORT_VERTICAL_NONE && isVerticalText) {
    // Each column keeps a running sum of centre x so its average is O(1).
    const columns = [];
    for (const entry of entries) {
      const column = columns.find(
        (candidate) =>
          Math.abs(entry.center.x - candidate.sumX / candidate.items.length) < GROUPING_TOLERANCE
      );
      if (column) {
        column.items.push(entry);
        column.sumX += entry.center.x;
      } else {
        columns.push({ items: [entry], sumX: 0 + entry.center.x });
      }
    }
    // Right-most column first.
    columns.sort(
      (colA, colB) => colB.sumX / colB.items.length - colA.sumX / colA.items.length
    );
    for (const column of columns) {
      column.items.sort(verticalDiff);
    }
    return columns.flatMap((column) => column.items).map((entry) => entry.annotation);
  }

  entries.sort((a, b) => {
    if (verticalSort !== SORT_VERTICAL_NONE) {
      const yDiff = verticalDiff(a, b);
      if (Math.abs(yDiff) > GROUPING_TOLERANCE) {
        return yDiff;
      }
    }
    if (horizontalSort !== SORT_HORIZONTAL_NONE) {
      return horizontalSort === SORT_HORIZONTAL_LTR
        ? a.center.x - b.center.x
        : b.center.x - a.center.x;
    }
    return 0;
  });
  return entries.map((entry) => entry.annotation);
}
|
|
701
|
+
// Maximum centre-coordinate difference for two boxes to be treated as being
// in the same row/column by sortBoundingBoxes (same units as the points).
var GROUPING_TOLERANCE;

// Lazy initializer for src/lib/sort.ts.
var init_sort = __esm({
  "src/lib/sort.ts"() {
    "use strict";
    init_esm_shims();
    init_constants();
    GROUPING_TOLERANCE = 50;
  }
});
|
|
710
|
+
|
|
711
|
+
// src/commands/toLabelStudio/impl.ts
|
|
712
|
+
// Export namespace for src/commands/toLabelStudio/impl.ts; the getter resolves
// `convertToLabelStudio` on access.
var impl_exports = {};
__export(impl_exports, { convertToLabelStudio: () => convertToLabelStudio });
|
|
716
|
+
import { mkdir, readFile, readdir, writeFile } from "fs/promises";
|
|
717
|
+
import { join as join2 } from "path";
|
|
718
|
+
import chalk from "chalk";
|
|
719
|
+
/**
 * Converts every PPOCRLabel `.txt` file found in the given directories to
 * Label Studio JSON files inside `flags.outDir`.
 *
 * For each label file it writes `<name>_full.json` or `<name>_min.json`,
 * optionally one JSON file per image (`createFilePerImage`), and optionally a
 * list of image URLs (`createFileListForServing`). A failure while processing
 * one label file is logged and the remaining files are still processed.
 *
 * NOTE(review): the file list and output names are derived per label file, so
 * label files sharing a name (or several label files in one run) write to the
 * same paths and the last one wins - confirm whether that is intended.
 *
 * @param {object} flags - CLI flags; every option falls back to its default.
 * @param {...string} inputDirs - Directories scanned (non-recursively) for
 *   `.txt` label files; image paths are resolved relative to them.
 * @returns {Promise<void>}
 */
async function convertToLabelStudio(flags, ...inputDirs) {
  const {
    outDir = OUTPUT_BASE_DIR,
    defaultLabelName = DEFAULT_LABEL_NAME,
    toFullJson = DEFAULT_LABEL_STUDIO_FULL_JSON,
    createFilePerImage = DEFAULT_CREATE_FILE_PER_IMAGE,
    createFileListForServing = DEFAULT_CREATE_FILE_LIST_FOR_SERVING,
    fileListName = DEFAULT_FILE_LIST_NAME,
    baseServerUrl = DEFAULT_BASE_SERVER_URL,
    sortVertical = DEFAULT_SORT_VERTICAL,
    sortHorizontal = DEFAULT_SORT_HORIZONTAL
  } = flags;
  // Exactly one trailing slash, unless the base URL is empty.
  const newBaseServerUrl = baseServerUrl.replace(/\/+$/, "") + (baseServerUrl === "" ? "" : "/");
  const formatSuffix = toFullJson ? "full" : "min";
  await mkdir(outDir, { recursive: true });
  for (const inputDir of inputDirs) {
    console.log(chalk.blue(`Processing input directory: ${inputDir}`));
    const files = await readdir(inputDir);
    for (const file of files) {
      if (!file.endsWith(".txt")) {
        continue;
      }
      const filePath = join2(inputDir, file);
      console.log(chalk.gray(`Processing file: ${file}`));
      try {
        const fileData = await readFile(filePath, "utf-8");
        const lines = fileData.trim().split("\n");
        const imageDataMap = /* @__PURE__ */ new Map();
        for (const line of lines) {
          // A line is "<imagePath><sep><JSON array>". Split on the first tab
          // when present (the PPOCRLabel separator), otherwise on the first
          // space, so spaces inside the JSON (e.g. in a transcription) no
          // longer make the line invalid.
          const tabIndex = line.indexOf("\t");
          const separatorIndex = tabIndex !== -1 ? tabIndex : line.indexOf(" ");
          if (separatorIndex === -1) {
            throw new Error(`Invalid PPOCRLabelV2 format in line: ${line}`);
          }
          const imagePath = line.slice(0, separatorIndex);
          const annotationsStr = line.slice(separatorIndex + 1);
          const annotations = JSON.parse(annotationsStr);
          // Validation only; throws on malformed annotation data.
          PPOCRLabelSchema.parse(annotations);
          imageDataMap.set(imagePath, annotations);
        }
        const allLabelStudioData = [];
        const fileList = [];
        let taskId = 1;
        for (const [imagePath, ppocrData] of imageDataMap.entries()) {
          const sortedPpocrData = sortBoundingBoxes(
            ppocrData,
            sortVertical,
            sortHorizontal
          );
          const labelStudioData = await ppocrToLabelStudio(sortedPpocrData, {
            toFullJson,
            imagePath,
            baseServerUrl: newBaseServerUrl,
            inputDir,
            taskId,
            labelName: defaultLabelName
          });
          // Full format yields a single task per image; min format yields one
          // record per region.
          if (toFullJson) {
            allLabelStudioData.push(labelStudioData[0]);
          } else {
            allLabelStudioData.push(...labelStudioData);
          }
          if (createFilePerImage) {
            const imageBaseName = imagePath.replace(/\//g, "_").replace(/\.[^.]+$/, "");
            const individualOutputPath = join2(
              outDir,
              `${imageBaseName}_${formatSuffix}.json`
            );
            await writeFile(
              individualOutputPath,
              JSON.stringify(
                toFullJson ? labelStudioData[0] : labelStudioData,
                null,
                2
              ),
              "utf-8"
            );
            console.log(
              chalk.gray(
                `  \u2713 Created individual file: ${individualOutputPath}`
              )
            );
          }
          if (createFileListForServing) {
            fileList.push(encodeURI(`${newBaseServerUrl}${imagePath}`));
          }
          taskId++;
        }
        // Strip only the trailing ".txt" extension (guaranteed by the
        // endsWith check above), not the first ".txt" found in the name.
        const baseName = file.slice(0, -".txt".length);
        const outputPath = join2(
          outDir,
          `${baseName}_${formatSuffix}.json`
        );
        await writeFile(
          outputPath,
          JSON.stringify(allLabelStudioData, null, 2),
          "utf-8"
        );
        console.log(chalk.green(`\u2713 Converted ${file} -> ${outputPath}`));
        if (createFileListForServing && fileList.length > 0) {
          const fileListPath = join2(outDir, fileListName);
          await writeFile(fileListPath, fileList.join("\n"), "utf-8");
          console.log(
            chalk.green(
              `\u2713 Created file list: ${fileListPath} (${fileList.length} files)`
            )
          );
        }
      } catch (error) {
        // Best effort: report the failure and continue with the next file.
        console.error(
          chalk.red(`\u2717 Failed to process ${file}:`),
          error instanceof Error ? error.message : error
        );
      }
    }
  }
  console.log(chalk.green("\n\u2713 Conversion completed!"));
}
|
|
835
|
+
// Lazy initializer for src/commands/toLabelStudio/impl.ts: initializes every
// module that convertToLabelStudio depends on.
var init_impl = __esm({
  "src/commands/toLabelStudio/impl.ts"() {
    "use strict";
    init_esm_shims();
    init_constants();
    init_ppocr_label();
    init_schema();
    init_sort();
  }
});
|
|
845
|
+
|
|
846
|
+
// src/lib/label-studio.ts
|
|
847
|
+
import * as turf from "@turf/turf";
|
|
848
|
+
var labelStudioToPPOCR, minLabelStudioToPPOCR;
var init_label_studio = __esm({
  "src/lib/label-studio.ts"() {
    "use strict";
    init_esm_shims();

    // Drop the scheme and host from an image URL and percent-decode the rest.
    // A malformed escape (e.g. a literal "%" in a file name) makes
    // decodeURIComponent throw; keep the undecoded path in that case so one
    // bad URL does not abort the conversion of the whole export.
    const imagePathFromUrl = (url) => {
      const urlPath = url.replace(/^https?:\/\/[^/]+\//, "");
      try {
        return decodeURIComponent(urlPath);
      } catch {
        return urlPath;
      }
    };

    // Derive a dt_score in [0.5, 1] from the polygon area, or 0.8 when the
    // polygon cannot be built (e.g. too few points).
    // NOTE(review): turf.area treats coordinates as geographic positions while
    // these are image coordinates, so the value is only a rough heuristic —
    // confirm this is intended.
    const scoreFromPoints = (points) => {
      try {
        const firstPoint = points[0];
        if (!firstPoint) {
          return 1;
        }
        // Close the ring by repeating the first vertex.
        const ring = points.concat([firstPoint]);
        const area = turf.area(turf.polygon([ring]));
        return Math.min(1, Math.max(0.5, area / 1e4));
      } catch {
        return 0.8;
      }
    };

    // Add annotations under an image path, merging with any already stored so
    // that several tasks pointing at the same image do not overwrite each other.
    const appendAnnotations = (resultMap, imagePath, annotations) => {
      const existing = resultMap.get(imagePath);
      if (existing) {
        existing.push(...annotations);
      } else {
        resultMap.set(imagePath, annotations);
      }
    };

    /**
     * Convert full-format Label Studio tasks into PPOCR annotations.
     *
     * @param data - Array of tasks, each with `data.ocr` and/or `file_upload`
     *   and an `annotations` array whose `result` items carry the regions.
     * @param baseImageDir - Optional directory prepended to the image file name.
     * @returns Promise of a Map from image path to an array of
     *   `{ transcription, points, dt_score }` objects.
     */
    labelStudioToPPOCR = async (data, baseImageDir) => {
      const resultMap = new Map();
      for (const task of data) {
        let imagePath = task.file_upload || "";
        if (task.data.ocr) {
          imagePath = imagePathFromUrl(task.data.ocr);
        }
        if (baseImageDir) {
          imagePath = `${baseImageDir}/${task.file_upload || imagePath.split("/").pop() || imagePath}`;
        }
        const imageAnnotations = [];
        for (const annotation of task.annotations) {
          // Result items sharing an id describe the same region (geometry and
          // text arrive as separate items), so group them before merging.
          const groupedById = new Map();
          for (const resultItem of annotation.result) {
            const { id } = resultItem;
            if (!groupedById.has(id)) {
              groupedById.set(id, []);
            }
            groupedById.get(id).push(resultItem);
          }
          for (const resultItems of groupedById.values()) {
            let points;
            let transcription = "";
            for (const resultItem of resultItems) {
              const { value, original_width, original_height } = resultItem;
              if ("points" in value && value.points) {
                // Polygon: scale percentage coordinates to pixels.
                points = value.points.map(([x, y]) => [
                  (x ?? 0) * original_width / 100,
                  (y ?? 0) * original_height / 100
                ]);
              } else if ("x" in value && "y" in value && "width" in value && "height" in value) {
                // Rectangle: scale to pixels and expand to its four corners.
                const absX = value.x * original_width / 100;
                const absY = value.y * original_height / 100;
                const absWidth = value.width * original_width / 100;
                const absHeight = value.height * original_height / 100;
                points = [
                  [absX, absY],
                  [absX + absWidth, absY],
                  [absX + absWidth, absY + absHeight],
                  [absX, absY + absHeight]
                ];
              }
              if ("text" in value && Array.isArray(value.text)) {
                transcription = value.text[0] || "";
              }
            }
            if (points && points.length > 0) {
              imageAnnotations.push({
                transcription,
                points,
                dt_score: scoreFromPoints(points)
              });
            }
          }
        }
        if (imageAnnotations.length > 0) {
          appendAnnotations(resultMap, imagePath, imageAnnotations);
        }
      }
      return resultMap;
    };

    /**
     * Convert min-format Label Studio items into PPOCR annotations.
     *
     * NOTE(review): only the first entry of `poly` / `bbox` / `transcription`
     * is read and coordinates are used as-is — confirm against the min schema
     * that each item holds exactly one region in the expected units.
     *
     * @param data - Array of items with `ocr`, `poly`, `bbox` and `transcription`.
     * @param baseImageDir - Optional directory prepended to the image file name.
     * @returns Promise of a Map from image path to an array of
     *   `{ transcription, points, dt_score }` objects.
     */
    minLabelStudioToPPOCR = async (data, baseImageDir) => {
      const resultMap = new Map();
      for (const item of data) {
        let imagePath = item.ocr || "";
        if (imagePath) {
          imagePath = imagePathFromUrl(imagePath);
        }
        if (baseImageDir) {
          imagePath = `${baseImageDir}/${imagePath.split("/").pop() || imagePath}`;
        }
        let points;
        if (item.poly.length > 0 && item.poly[0]) {
          points = item.poly[0].points;
        } else if (item.bbox.length > 0 && item.bbox[0]) {
          const { x, y, width, height } = item.bbox[0];
          points = [
            [x, y],
            [x + width, y],
            [x + width, y + height],
            [x, y + height]
          ];
        } else {
          // No geometry for this item: nothing to emit.
          continue;
        }
        const annotation = {
          transcription: item.transcription[0] ?? "",
          points,
          dt_score: scoreFromPoints(points)
        };
        appendAnnotations(resultMap, imagePath, [annotation]);
      }
      return resultMap;
    };
  }
});
|
|
983
|
+
|
|
984
|
+
// src/commands/toPPOCR/impl.ts
|
|
985
|
+
// Export namespace for src/commands/toPPOCR/impl.ts. The getter resolves the
// binding on access, so it can be registered before the function is used.
var impl_exports2 = {};
__export(impl_exports2, {
  convertToPPOCR() {
    return convertToPPOCR;
  }
});
|
|
989
|
+
import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile2 } from "fs/promises";
|
|
990
|
+
import { join as join3 } from "path";
|
|
991
|
+
import chalk2 from "chalk";
|
|
992
|
+
/**
 * Convert the Label Studio JSON exports found in the given directories into
 * PPOCRLabel text files.
 *
 * Every `*.json` file of every input directory is parsed, detected as full or
 * min format, converted, sorted, validated against the PPOCR schema and
 * written to `<outDir>/<baseName>_<fileName>`, one line per image. A file
 * that fails at any step is reported on stderr and skipped; the remaining
 * files are still processed.
 *
 * @param flags - Command flags: `outDir`, `fileName`, `baseImageDir`,
 *   `sortVertical`, `sortHorizontal` (all optional).
 * @param inputDirs - Directories to scan for Label Studio JSON files.
 * @returns Promise that resolves once every directory has been processed.
 */
async function convertToPPOCR(flags, ...inputDirs) {
  const {
    outDir = `${OUTPUT_BASE_DIR}`,
    fileName = DEFAULT_PPOCR_FILE_NAME,
    baseImageDir,
    sortVertical = DEFAULT_SORT_VERTICAL,
    sortHorizontal = DEFAULT_SORT_HORIZONTAL
  } = flags;
  const jsonExtension = ".json";
  await mkdir2(outDir, { recursive: true });
  for (const inputDir of inputDirs) {
    console.log(chalk2.blue(`Processing input directory: ${inputDir}`));
    const files = await readdir2(inputDir);
    for (const file of files) {
      if (!file.endsWith(jsonExtension)) {
        continue;
      }
      const filePath = join3(inputDir, file);
      console.log(chalk2.gray(`Processing file: ${file}`));
      try {
        const fileData = await readFile2(filePath, "utf-8");
        const labelStudioData = JSON.parse(fileData);
        const { data, isFull } = isLabelStudioFullJSON(labelStudioData);
        const ppocrDataMap = isFull
          ? await labelStudioToPPOCR(data, baseImageDir)
          : await minLabelStudioToPPOCR(data, baseImageDir);
        const outputLines = [];
        for (const [imagePath, annotations] of ppocrDataMap.entries()) {
          const sortedAnnotations = sortBoundingBoxes(
            annotations,
            sortVertical,
            sortHorizontal
          );
          // Throws on invalid output so a bad file is reported, not written.
          PPOCRLabelSchema.parse(sortedAnnotations);
          const jsonArray = JSON.stringify(sortedAnnotations);
          // NOTE(review): PPOCRLabel label files conventionally separate the
          // image path and the JSON with a tab — confirm the separator here.
          outputLines.push(`${imagePath} ${jsonArray}`);
        }
        // Strip only the trailing extension: String.replace would remove the
        // first ".json" occurrence anywhere in the name ("a.json.v2.json").
        const baseName = file.slice(0, -jsonExtension.length);
        const outputPath = join3(outDir, `${baseName}_${fileName}`);
        await writeFile2(outputPath, outputLines.join("\n"), "utf-8");
        console.log(chalk2.green(`\u2713 Converted ${file} -> ${outputPath}`));
      } catch (error) {
        console.error(
          chalk2.red(`\u2717 Failed to process ${file}:`),
          error instanceof Error ? error.message : error
        );
      }
    }
  }
  console.log(chalk2.green("\n\u2713 Conversion completed!"));
}
|
|
1043
|
+
var isLabelStudioFullJSON;
// Lazy initializer for src/commands/toPPOCR/impl.ts.
var init_impl2 = __esm({
  "src/commands/toPPOCR/impl.ts": function () {
    "use strict";
    init_esm_shims();
    init_constants();
    init_label_studio();
    init_schema();
    init_sort();
    /**
     * Detect which Label Studio export format `data` is in.
     *
     * Tries, in order: the full schema, the full schema with a lone object
     * wrapped in an array, then the min schema.
     *
     * @param data - Parsed JSON content of an export file.
     * @returns `{ isFull, data }` with the schema-parsed data.
     * @throws Error when the input matches none of the schemas.
     */
    isLabelStudioFullJSON = function (data) {
      const asFullList = FullOCRLabelStudioSchema.safeParse(data);
      if (asFullList.success) {
        return { isFull: true, data: asFullList.data };
      }
      const isLoneObject = typeof data === "object" && data !== null && !Array.isArray(data);
      if (isLoneObject) {
        // A single task exported on its own: validate it as a one-item list.
        const asSingleTask = FullOCRLabelStudioSchema.safeParse([data]);
        if (asSingleTask.success) {
          return { isFull: true, data: asSingleTask.data };
        }
      }
      const asMin = MinOCRLabelStudioSchema.safeParse(data);
      if (!asMin.success) {
        throw new Error("Input data is not valid Label Studio JSON format.");
      }
      return { isFull: false, data: asMin.data };
    };
  }
});
|
|
1071
|
+
|
|
1072
|
+
// src/bin/cli.ts
|
|
1073
|
+
init_esm_shims();
|
|
1074
|
+
import { run } from "@stricli/core";
|
|
1075
|
+
|
|
1076
|
+
// src/app.ts
|
|
1077
|
+
init_esm_shims();
|
|
1078
|
+
import {
|
|
1079
|
+
buildInstallCommand,
|
|
1080
|
+
buildUninstallCommand
|
|
1081
|
+
} from "@stricli/auto-complete";
|
|
1082
|
+
import { buildApplication, buildRouteMap } from "@stricli/core";
|
|
1083
|
+
|
|
1084
|
+
// package.json
|
|
1085
|
+
// Values inlined from package.json at build time.
// Version reported by the CLI.
var version = "1.0.0";
// One-line summary used as the top-level help text.
var description = "Convert between Label Studio OCR format and PPOCRLabelv2 format";
|
|
1087
|
+
|
|
1088
|
+
// src/commands/toLabelStudio/command.ts
|
|
1089
|
+
init_esm_shims();
|
|
1090
|
+
init_constants();
|
|
1091
|
+
import { buildCommand } from "@stricli/core";
|
|
1092
|
+
// CLI command definition: PPOCRLabel -> Label Studio. The implementation is
// loaded on demand through `loader`.
var toLabelStudioCommand = (() => {
  // Optional flag whose value is parsed with String.
  const textFlag = (brief) => ({
    kind: "parsed",
    brief,
    parse: String,
    optional: true
  });
  // Optional boolean flag.
  const switchFlag = (brief) => ({
    kind: "boolean",
    brief,
    optional: true
  });
  return buildCommand({
    loader: async () => {
      const implModule = await Promise.resolve().then(() => {
        init_impl();
        return impl_exports;
      });
      return implModule.convertToLabelStudio;
    },
    parameters: {
      positional: {
        kind: "array",
        parameter: {
          brief: "Input directories containing PPOCRLabel files",
          parse: String
        },
        minimum: 1
      },
      flags: {
        outDir: textFlag(`Output directory. Default to "${OUTPUT_BASE_DIR}"`),
        defaultLabelName: textFlag(
          `Default label name for text annotations. Default to "${DEFAULT_LABEL_NAME}"`
        ),
        toFullJson: switchFlag(
          `Convert to Full OCR Label Studio format. Default to "${DEFAULT_LABEL_STUDIO_FULL_JSON}"`
        ),
        createFilePerImage: switchFlag(
          `Create a separate Label Studio JSON file for each image. Default to "${DEFAULT_CREATE_FILE_PER_IMAGE}"`
        ),
        createFileListForServing: switchFlag(
          `Create a file list for serving in Label Studio. Default to "${DEFAULT_CREATE_FILE_LIST_FOR_SERVING}"`
        ),
        fileListName: textFlag(
          `Name of the file list for serving. Default to "${DEFAULT_FILE_LIST_NAME}"`
        ),
        baseServerUrl: textFlag(
          `Base server URL for constructing image URLs in the file list. Default to "${DEFAULT_BASE_SERVER_URL}"`
        ),
        sortVertical: textFlag(
          `Sort bounding boxes vertically. Options: "${SORT_VERTICAL_NONE}" (default), "${SORT_VERTICAL_TOP_BOTTOM}", "${SORT_VERTICAL_BOTTOM_TOP}"`
        ),
        sortHorizontal: textFlag(
          `Sort bounding boxes horizontally. Options: "${SORT_HORIZONTAL_NONE}" (default), "${SORT_HORIZONTAL_LTR}", "${SORT_HORIZONTAL_RTL}"`
        )
      }
    },
    docs: {
      brief: "Convert PPOCRLabel files to Label Studio format"
    }
  });
})();
|
|
1164
|
+
|
|
1165
|
+
// src/commands/toPPOCR/commands.ts
|
|
1166
|
+
init_esm_shims();
|
|
1167
|
+
init_constants();
|
|
1168
|
+
import { buildCommand as buildCommand2 } from "@stricli/core";
|
|
1169
|
+
// CLI command definition: Label Studio -> PPOCRLabel. The implementation is
// loaded on demand through `loader`.
var toPPOCRCommand = (() => {
  // Every flag of this command is an optional value parsed with String.
  const textFlag = (brief) => ({
    kind: "parsed",
    brief,
    parse: String,
    optional: true
  });
  return buildCommand2({
    loader: async () => {
      const implModule = await Promise.resolve().then(() => {
        init_impl2();
        return impl_exports2;
      });
      return implModule.convertToPPOCR;
    },
    parameters: {
      positional: {
        kind: "array",
        parameter: {
          brief: "Input directories containing Label Studio files",
          parse: String
        },
        minimum: 1
      },
      flags: {
        outDir: textFlag(`Output directory. Default to "${OUTPUT_BASE_DIR}"`),
        fileName: textFlag(
          `Output PPOCR file name. Default to "${DEFAULT_PPOCR_FILE_NAME}"`
        ),
        baseImageDir: textFlag(
          'Base directory path to prepend to image filenames in output (e.g., "ch" or "images/ch")'
        ),
        sortVertical: textFlag(
          `Sort bounding boxes vertically. Options: "${SORT_VERTICAL_NONE}" (default), "${SORT_VERTICAL_TOP_BOTTOM}", "${SORT_VERTICAL_BOTTOM_TOP}"`
        ),
        sortHorizontal: textFlag(
          `Sort bounding boxes horizontally. Options: "${SORT_HORIZONTAL_NONE}" (default), "${SORT_HORIZONTAL_LTR}", "${SORT_HORIZONTAL_RTL}"`
        )
      }
    },
    docs: {
      brief: "Convert Label Studio files to PPOCRLabel format"
    }
  });
})();
|
|
1220
|
+
|
|
1221
|
+
// src/app.ts
|
|
1222
|
+
// Top-level route map: the two converters plus the shell-completion
// install/uninstall commands, which are hidden from the help listing.
var routes = (() => {
  const cliName = "label-studio-converter";
  const installCommand = buildInstallCommand(cliName, {
    bash: "__label-studio-converter_bash_complete"
  });
  const uninstallCommand = buildUninstallCommand(cliName, { bash: true });
  return buildRouteMap({
    routes: {
      toLabelStudio: toLabelStudioCommand,
      toPPOCR: toPPOCRCommand,
      install: installCommand,
      uninstall: uninstallCommand
    },
    docs: {
      brief: description,
      hideRoute: { install: true, uninstall: true }
    }
  });
})();
|
|
1239
|
+
// Application object handed to the runner: the route map plus the CLI name
// and the version it reports.
var app = buildApplication(routes, {
  name: "label-studio-converter",
  versionInfo: { currentVersion: version }
});
|
|
1245
|
+
|
|
1246
|
+
// src/context.ts
|
|
1247
|
+
init_esm_shims();
|
|
1248
|
+
import fs from "fs";
|
|
1249
|
+
import os from "os";
|
|
1250
|
+
import path2 from "path";
|
|
1251
|
+
/**
 * Bundle the given process object with Node's `os`, `fs` and `path` modules
 * into the context object passed to the CLI runner.
 *
 * @param process2 - Process object to expose as `context.process`.
 * @returns Object with `process`, `os`, `fs` and `path` properties.
 */
function buildContext(process2) {
  const context = { process: process2, os, fs, path: path2 };
  return context;
}
|
|
1259
|
+
|
|
1260
|
+
// src/bin/cli.ts
|
|
1261
|
+
// Entry point: run the application with the user-supplied arguments. The
// promise returned by `run` is awaited so it never floats, and an unexpected
// rejection is reported through the exit code instead of surfacing as an
// unhandled rejection.
(async () => {
  try {
    await run(app, process.argv.slice(2), buildContext(process));
  } catch (error) {
    console.error(error instanceof Error ? error.message : error);
    process.exitCode = 1;
  }
})();
|
|
1264
|
+
//# sourceMappingURL=cli.js.map
|