koten-layout-detector 1.0.0

package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,130 @@
+ # koten-layout-detector
+
+ Japanese classical document layout analysis library using ONNX Runtime for detecting text regions, illustrations, and stamps in historical Japanese documents.
+
+ ## Features
+
+ - 🏯 Specialized for Japanese classical documents (古典籍)
+ - 🚀 Browser-based inference using ONNX Runtime Web
+ - 📦 Lightweight and easy to integrate
+ - 🎯 Detects 5 types of regions:
+   - Overall layout (全体)
+   - Handwritten text (手書き)
+   - Typographic text (活字)
+   - Illustrations (図版)
+   - Stamps/Seals (印判)
+
+ ## See It In Action
+
+ Check out the live demo at [https://koten-layout.netlify.app/](https://koten-layout.netlify.app/).
+
+ ## Installation
+
+ ```bash
+ npm install koten-layout-detector onnxruntime-web
+ ```
+
+ ## Usage
+
+ ```javascript
+ import {
+   loadModel,
+   preprocess,
+   runInference,
+   postprocess,
+   drawDetections,
+   CLASSES,
+   COLORS
+ } from 'koten-layout-detector'
+
+ // Load the ONNX model
+ const session = await loadModel('/path/to/your/model.onnx')
+
+ // Load an image
+ const img = new Image()
+ img.src = '/path/to/classical-document.jpg'
+ await img.decode()
+
+ // Preprocess the image
+ const { tensor, meta } = preprocess(img)
+
+ // Run inference
+ const outputTensor = await runInference(session, tensor)
+
+ // Postprocess results
+ const detections = postprocess(outputTensor, meta, 0.5, 0.45)
+
+ // Draw detections on canvas
+ const canvas = document.getElementById('output-canvas')
+ drawDetections(canvas, img, detections)
+
+ console.log('Detected regions:', detections)
+ ```
+
+ ## API Reference
+
+ ### `loadModel(modelUrl: string): Promise<InferenceSession>`
+
+ Loads an ONNX model from the specified URL.
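+
+ Internally, `loadModel` pins the WASM asset path and creates the session with the `wasm` execution provider (see `src/inference.js`), so the onnxruntime-web `.wasm` files must be served from your site root. The equivalent manual setup looks like this:
+
+ ```javascript
+ import * as ort from 'onnxruntime-web'
+
+ // What loadModel does internally: resolve .wasm files from '/' and use the WASM backend.
+ ort.env.wasm.wasmPaths = '/'
+ const session = await ort.InferenceSession.create('/path/to/your/model.onnx', {
+   executionProviders: ['wasm'],
+   graphOptimizationLevel: 'all',
+ })
+ ```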
+
+ ### `preprocess(img: HTMLImageElement): { tensor: Tensor, meta: Object }`
+
+ Preprocesses an image for inference with letterbox resizing.
+
+ Returns:
+ - `tensor`: ONNX tensor ready for inference
+ - `meta`: Metadata for postprocessing (scale, padding, original dimensions); see the sketch after this list
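+
+ Concretely, `meta` has the shape `{ scale, padX, padY, origW, origH }`. As a minimal sketch (the helper name `toOriginalCoords` is just for illustration), it can map any point from the 640×640 letterboxed model space back to original-image coordinates, which is exactly what `postprocess` does for box corners:
+
+ ```javascript
+ // Illustrative helper: undo the letterbox transform for a single model-space point.
+ function toOriginalCoords(x, y, meta) {
+   return {
+     x: (x - meta.padX) / meta.scale,
+     y: (y - meta.padY) / meta.scale,
+   }
+ }
+ ```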
+
+ ### `runInference(session: InferenceSession, tensor: Tensor): Promise<Tensor>`
+
+ Runs inference on the preprocessed tensor.
+
+ ### `postprocess(outputTensor: Tensor, meta: Object, confThreshold?: number, iouThreshold?: number): Array<Detection>`
+
+ Postprocesses the model output into detection results.
+
+ Parameters:
+ - `confThreshold`: Confidence threshold (default: 0.5)
+ - `iouThreshold`: IoU threshold for NMS (default: 0.45)
+
+ Returns an array of detections with:
+ - `x1, y1, x2, y2`: Bounding box coordinates (in original-image pixels)
+ - `conf`: Confidence score
+ - `classId`: Class ID
+ - `label`: Japanese label
+ - `color`: Color for visualization
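+
+ Detections are plain objects, so they can be filtered and sorted with ordinary array methods. A small illustrative sketch, reusing `detections` from the Usage example (the label strings come from `CLASSES`):
+
+ ```javascript
+ // Keep only high-confidence handwritten-text regions, largest area first.
+ const handwritten = detections
+   .filter((d) => d.label === '手書き' && d.conf > 0.7)
+   .sort((a, b) => (b.x2 - b.x1) * (b.y2 - b.y1) - (a.x2 - a.x1) * (a.y2 - a.y1))
+ ```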
+
+ ### `drawDetections(canvas: HTMLCanvasElement, img: HTMLImageElement, detections: Array<Detection>): void`
+
+ Draws the original image and detection boxes on a canvas.
+
+ ### `CLASSES`
+
+ Array of class definitions with ID, key, and Japanese label.
+
+ ### `COLORS`
+
+ Array of per-class colors for visualization.
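+
+ `CLASSES` and `COLORS` are index-aligned, so a color legend can be built by zipping them. A minimal sketch (assuming both are imported from the package as in the Usage example):
+
+ ```javascript
+ // Build { label, color } pairs for a legend; CLASSES[i] corresponds to COLORS[i].
+ const legend = CLASSES.map((cls, i) => ({ label: cls.ja, color: COLORS[i] }))
+ console.log(legend) // e.g. [{ label: '全体', color: '#e74c3c' }, ...]
+ ```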
+
+ ## Dataset
+
+ This model is trained on the [NDL-DocL Layout Dataset](https://github.com/ndl-lab/layout-dataset) provided by the National Diet Library of Japan. The dataset contains annotated layout information for Japanese classical documents.
+
+ ## Model
+
+ The detection model is based on YOLOv12, optimized for classical Japanese document analysis.
+
+ **Note:** You need to provide your own trained ONNX model file. This library provides the inference pipeline but does not include the model weights.
+
+ ## License
+
+ MIT
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ ## Acknowledgments
+
+ - [NDL-DocL Layout Dataset](https://github.com/ndl-lab/layout-dataset) - National Diet Library of Japan
+ - ONNX Runtime Web team for the excellent inference engine
package/package.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "name": "koten-layout-detector",
+   "version": "1.0.0",
+   "description": "Japanese classical document layout analysis library using ONNX Runtime for detecting text regions, illustrations, and stamps in historical Japanese documents",
+   "main": "src/index.js",
+   "type": "module",
+   "scripts": {
+     "test": "echo \"Error: no test specified\" && exit 1"
+   },
+   "keywords": [
+     "onnx",
+     "layout-analysis",
+     "document-analysis",
+     "japanese",
+     "classical-documents",
+     "yolo",
+     "object-detection",
+     "koten"
+   ],
+   "author": "",
+   "license": "MIT",
+   "repository": {
+     "type": "git",
+     "url": "https://github.com/yuta1984/koten-layout-detector.git"
+   },
+   "bugs": {
+     "url": "https://github.com/yuta1984/koten-layout-detector/issues"
+   },
+   "homepage": "https://github.com/yuta1984/koten-layout-detector#readme",
+   "peerDependencies": {
+     "onnxruntime-web": "^1.20.0"
+   },
+   "devDependencies": {
+     "onnxruntime-web": "^1.20.1"
+   },
+   "files": [
+     "src/**/*",
+     "README.md",
+     "LICENSE"
+   ]
+ }
package/src/index.js ADDED
@@ -0,0 +1,19 @@
+ /**
+  * koten-layout-detector
+  * Japanese classical document layout analysis library
+  *
+  * Trained on NDL-DocL Layout Dataset
+  * https://github.com/ndl-lab/layout-dataset
+  */
+
+ export {
+   loadModel,
+   preprocess,
+   runInference,
+   postprocess,
+   drawDetections,
+   CLASSES,
+   COLORS
+ } from './inference.js'
+
+ export { iou, nms } from './nms.js'
package/src/inference.js ADDED
@@ -0,0 +1,204 @@
+ import * as ort from 'onnxruntime-web'
+ import { nms } from './nms.js'
+
+ /** Model input size */
+ const MODEL_SIZE = 640
+
+ /** Letterbox padding color (YOLO default: gray 114) */
+ const PAD_COLOR = 114
+
+ /**
+  * Class definitions
+  * The 5 classes of the NDL-DocL classical-documents dataset
+  */
+ export const CLASSES = [
+   { id: 0, key: '1_overall', ja: '全体' },
+   { id: 1, key: '2_handwritten', ja: '手書き' },
+   { id: 2, key: '3_typography', ja: '活字' },
+   { id: 3, key: '4_illustration', ja: '図版' },
+   { id: 4, key: '5_stamp', ja: '印判' },
+ ]
+
+ export const COLORS = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6']
+
+ // ---------------------------------------------------------------------------
+ // Model loading
+ // ---------------------------------------------------------------------------
+
+ /**
+  * Create and return an ONNX inference session
+  * @param {string} modelUrl - URL of the model file
+  * @returns {Promise<ort.InferenceSession>}
+  */
+ export async function loadModel(modelUrl) {
+   // Point ONNX Runtime at the WASM files explicitly (matches Vite's static-copy destination)
+   ort.env.wasm.wasmPaths = '/'
+
+   const session = await ort.InferenceSession.create(modelUrl, {
+     executionProviders: ['wasm'],
+     graphOptimizationLevel: 'all',
+   })
+   return session
+ }
+
+ // ---------------------------------------------------------------------------
+ // Preprocessing
+ // ---------------------------------------------------------------------------
+
+ /**
+  * Letterbox-resize an image and convert it to a Float32Array tensor
+  * @param {HTMLImageElement} img
+  * @returns {{ tensor: ort.Tensor, meta: Object }}
+  *   meta: { scale, padX, padY, origW, origH }
+  */
+ export function preprocess(img) {
+   const canvas = document.createElement('canvas')
+   canvas.width = MODEL_SIZE
+   canvas.height = MODEL_SIZE
+   const ctx = canvas.getContext('2d')
+
+   // Fill with the padding color
+   ctx.fillStyle = `rgb(${PAD_COLOR},${PAD_COLOR},${PAD_COLOR})`
+   ctx.fillRect(0, 0, MODEL_SIZE, MODEL_SIZE)
+
+   // Scale down while preserving aspect ratio
+   const scale = Math.min(MODEL_SIZE / img.width, MODEL_SIZE / img.height)
+   const newW = Math.round(img.width * scale)
+   const newH = Math.round(img.height * scale)
+   const padX = Math.floor((MODEL_SIZE - newW) / 2)
+   const padY = Math.floor((MODEL_SIZE - newH) / 2)
+
+   ctx.drawImage(img, padX, padY, newW, newH)
+
+   const { data } = ctx.getImageData(0, 0, MODEL_SIZE, MODEL_SIZE)
+
+   // HWC (RGBA) → CHW (RGB) Float32, normalized by ÷255
+   const float32 = new Float32Array(3 * MODEL_SIZE * MODEL_SIZE)
+   const pixelCount = MODEL_SIZE * MODEL_SIZE
+   for (let i = 0; i < pixelCount; i++) {
+     float32[i] = data[i * 4] / 255.0 // R
+     float32[i + pixelCount] = data[i * 4 + 1] / 255.0 // G
+     float32[i + pixelCount * 2] = data[i * 4 + 2] / 255.0 // B
+   }
+
+   return {
+     tensor: new ort.Tensor('float32', float32, [1, 3, MODEL_SIZE, MODEL_SIZE]),
+     meta: { scale, padX, padY, origW: img.width, origH: img.height },
+   }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Inference
+ // ---------------------------------------------------------------------------
+
+ /**
+  * Run inference with the ONNX session
+  * @param {ort.InferenceSession} session
+  * @param {ort.Tensor} tensor
+  * @returns {Promise<ort.Tensor>} Output tensor [1, 9, 8400]
+  */
+ export async function runInference(session, tensor) {
+   const inputName = session.inputNames[0]
+   const feeds = { [inputName]: tensor }
+   const results = await session.run(feeds)
+   return results[session.outputNames[0]]
+ }
+
+ // ---------------------------------------------------------------------------
+ // Postprocessing
+ // ---------------------------------------------------------------------------
+
+ /**
+  * Convert the ONNX output tensor into detection results
+  * @param {ort.Tensor} outputTensor - shape [1, 4+nc, 8400]
+  * @param {Object} meta - meta object returned by preprocess()
+  * @param {number} confThreshold - confidence threshold (default 0.5)
+  * @param {number} iouThreshold - NMS IoU threshold (default 0.45)
+  * @returns {Array} List of detections { x1, y1, x2, y2, conf, classId, label, color }
+  */
+ export function postprocess(outputTensor, meta, confThreshold = 0.5, iouThreshold = 0.45) {
+   const [, numChannels, numPreds] = outputTensor.dims
+   const data = outputTensor.data
+   const nc = numChannels - 4 // number of classes
+
+   const raw = []
+
+   for (let i = 0; i < numPreds; i++) {
+     // Find the maximum per-class score and its class ID
+     let maxScore = -Infinity
+     let classId = 0
+     for (let c = 0; c < nc; c++) {
+       const score = data[(4 + c) * numPreds + i]
+       if (score > maxScore) {
+         maxScore = score
+         classId = c
+       }
+     }
+
+     if (maxScore < confThreshold) continue
+
+     // cx, cy, w, h (640px scale) → x1, y1, x2, y2 (original-image scale)
+     const cx = data[0 * numPreds + i]
+     const cy = data[1 * numPreds + i]
+     const w = data[2 * numPreds + i]
+     const h = data[3 * numPreds + i]
+
+     // Undo the letterbox padding and scaling
+     const x1 = ((cx - w / 2) - meta.padX) / meta.scale
+     const y1 = ((cy - h / 2) - meta.padY) / meta.scale
+     const x2 = ((cx + w / 2) - meta.padX) / meta.scale
+     const y2 = ((cy + h / 2) - meta.padY) / meta.scale
+
+     raw.push({ x1, y1, x2, y2, conf: maxScore, classId })
+   }
+
+   const kept = nms(raw, iouThreshold)
+
+   return kept.map((d) => ({
+     ...d,
+     label: CLASSES[d.classId]?.ja ?? String(d.classId),
+     color: COLORS[d.classId] ?? '#ffffff',
+   }))
+ }
+
+ // ---------------------------------------------------------------------------
+ // Canvas drawing
+ // ---------------------------------------------------------------------------
+
+ /**
+  * Draw the original image and detection results onto a canvas
+  * @param {HTMLCanvasElement} canvas
+  * @param {HTMLImageElement} img
+  * @param {Array} detections - return value of postprocess()
+  */
+ export function drawDetections(canvas, img, detections) {
+   canvas.width = img.width
+   canvas.height = img.height
+   const ctx = canvas.getContext('2d')
+   ctx.drawImage(img, 0, 0)
+
+   for (const d of detections) {
+     const x1 = Math.max(0, d.x1)
+     const y1 = Math.max(0, d.y1)
+     const bw = d.x2 - x1
+     const bh = d.y2 - y1
+
+     // Bounding box
+     ctx.strokeStyle = d.color
+     ctx.lineWidth = Math.max(2, img.width / 300)
+     ctx.strokeRect(x1, y1, bw, bh)
+
+     // Label background
+     const fontSize = Math.max(14, img.width / 50)
+     ctx.font = `bold ${fontSize}px sans-serif`
+     const text = `${d.label} ${(d.conf * 100).toFixed(0)}%`
+     const textW = ctx.measureText(text).width
+     const textH = fontSize * 1.4
+     ctx.fillStyle = d.color
+     ctx.fillRect(x1, y1 - textH, textW + 8, textH)
+
+     // Label text
+     ctx.fillStyle = '#ffffff'
+     ctx.fillText(text, x1 + 4, y1 - fontSize * 0.2)
+   }
+ }
package/src/nms.js ADDED
@@ -0,0 +1,48 @@
+ /**
+  * Compute IoU (Intersection over Union)
+  * @param {Object} a - { x1, y1, x2, y2 }
+  * @param {Object} b - { x1, y1, x2, y2 }
+  * @returns {number} IoU value (0–1)
+  */
+ export function iou(a, b) {
+   const ix1 = Math.max(a.x1, b.x1)
+   const iy1 = Math.max(a.y1, b.y1)
+   const ix2 = Math.min(a.x2, b.x2)
+   const iy2 = Math.min(a.y2, b.y2)
+
+   const interW = Math.max(0, ix2 - ix1)
+   const interH = Math.max(0, iy2 - iy1)
+   const interArea = interW * interH
+
+   const areaA = (a.x2 - a.x1) * (a.y2 - a.y1)
+   const areaB = (b.x2 - b.x1) * (b.y2 - b.y1)
+   const unionArea = areaA + areaB - interArea
+
+   return unionArea <= 0 ? 0 : interArea / unionArea
+ }
+
+ /**
+  * Apply Non-Maximum Suppression
+  * @param {Array} detections - array of detections ({ x1, y1, x2, y2, conf, classId })
+  * @param {number} iouThreshold - IoU threshold (default 0.45)
+  * @returns {Array} Detections remaining after NMS
+  */
+ export function nms(detections, iouThreshold = 0.45) {
+   // Apply NMS independently per class
+   const classIds = [...new Set(detections.map((d) => d.classId))]
+   const result = []
+
+   for (const cid of classIds) {
+     let boxes = detections
+       .filter((d) => d.classId === cid)
+       .sort((a, b) => b.conf - a.conf) // sort by confidence, highest first
+
+     while (boxes.length > 0) {
+       const best = boxes.shift()
+       result.push(best)
+       boxes = boxes.filter((b) => iou(best, b) < iouThreshold)
+     }
+   }
+
+   return result
+ }