macos-vision 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +161 -103
- package/bin/pdf-helper +0 -0
- package/bin/vision-helper +0 -0
- package/dist/cli.js +131 -68
- package/dist/index.d.ts +2 -0
- package/dist/index.js +5 -3
- package/dist/markdown/chunker.d.ts +11 -0
- package/dist/markdown/chunker.js +39 -0
- package/dist/markdown/index.d.ts +61 -0
- package/dist/markdown/index.js +92 -0
- package/dist/markdown/ollama.d.ts +21 -0
- package/dist/markdown/ollama.js +50 -0
- package/dist/markdown/prompt.d.ts +35 -0
- package/dist/markdown/prompt.js +82 -0
- package/package.json +30 -5
- package/src/native/pdf-helper.swift +122 -0
- package/src/native/vision-helper.swift +241 -0
- package/.husky/commit-msg +0 -2
- package/.husky/pre-commit +0 -3
- package/.prettierignore +0 -4
- package/.prettierrc.json +0 -7
- package/.release-it.json +0 -20
- package/CHANGELOG.md +0 -44
- package/commitlint.config.js +0 -1
- package/debug.js +0 -37
- package/eslint.config.js +0 -21
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import Vision
|
|
2
|
+
import AppKit
|
|
3
|
+
import Foundation
|
|
4
|
+
|
|
5
|
+
// ─── Result structs ──────────────────────────────────────────────────────────
|
|
6
|
+
|
|
7
|
+
/// One recognized piece of text, emitted as an element of the `--json` OCR output.
///
/// The rectangle is taken from the text observation's `boundingBox`; `y` is
/// passed through `flipY` so that it is measured from the top edge.
struct OCRResult: Codable {
    /// String of the top recognition candidate.
    let t: String
    /// Left edge of the bounding box.
    let x: Double
    /// Top edge of the bounding box (after the vertical flip).
    let y: Double
    /// Width of the bounding box.
    let w: Double
    /// Height of the bounding box.
    let h: Double
    /// Confidence reported for the top candidate.
    let confidence: Float
}
|
|
12
|
+
|
|
13
|
+
/// One detected face, emitted as an element of the `--faces` JSON output.
///
/// The rectangle is taken from the face observation's `boundingBox`; `y` is
/// passed through `flipY` so that it is measured from the top edge.
struct FaceResult: Codable {
    /// Left edge of the bounding box.
    let x: Double
    /// Top edge of the bounding box (after the vertical flip).
    let y: Double
    /// Width of the bounding box.
    let w: Double
    /// Height of the bounding box.
    let h: Double
    /// Confidence reported by the face observation.
    let confidence: Float
}
|
|
17
|
+
|
|
18
|
+
/// One detected barcode, emitted as an element of the `--barcodes` JSON output.
///
/// The rectangle is taken from the barcode observation's `boundingBox`; `y` is
/// passed through `flipY` so that it is measured from the top edge.
struct BarcodeResult: Codable {
    /// Raw value of the observation's symbology.
    let type: String
    /// Decoded payload string; empty when the observation carries none.
    let value: String
    /// Left edge of the bounding box.
    let x: Double
    /// Top edge of the bounding box (after the vertical flip).
    let y: Double
    /// Width of the bounding box.
    let w: Double
    /// Height of the bounding box.
    let h: Double
    /// Confidence reported by the barcode observation.
    let confidence: Float
}
|
|
24
|
+
|
|
25
|
+
/// One detected rectangle, emitted as an element of the `--rectangles` JSON output.
///
/// Every corner is the two-element `[x, y]` array produced by `pt`, i.e. with
/// the vertical coordinate already flipped.
struct RectangleResult: Codable {
    /// Top-left corner as `[x, y]`.
    let topLeft: [Double]
    /// Top-right corner as `[x, y]`.
    let topRight: [Double]
    /// Bottom-left corner as `[x, y]`.
    let bottomLeft: [Double]
    /// Bottom-right corner as `[x, y]`.
    let bottomRight: [Double]
    /// Confidence reported by the rectangle observation.
    let confidence: Float
}
|
|
30
|
+
|
|
31
|
+
/// One detected document outline, emitted as an element of the `--document` JSON output.
///
/// Every corner is the two-element `[x, y]` array produced by `pt`, i.e. with
/// the vertical coordinate already flipped.
struct DocumentResult: Codable {
    /// Top-left corner as `[x, y]`.
    let topLeft: [Double]
    /// Top-right corner as `[x, y]`.
    let topRight: [Double]
    /// Bottom-left corner as `[x, y]`.
    let bottomLeft: [Double]
    /// Bottom-right corner as `[x, y]`.
    let bottomRight: [Double]
    /// Confidence reported by the observation.
    let confidence: Float
}
|
|
36
|
+
|
|
37
|
+
/// One image-classification label, emitted as an element of the `--classify` JSON output.
struct ClassificationResult: Codable {
    /// Identifier of the classification observation.
    let identifier: String

    /// Confidence reported for that identifier.
    let confidence: Float
}
|
|
41
|
+
|
|
42
|
+
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
43
|
+
|
|
44
|
+
// Vision: 0,0 = bottom-left. We flip Y so 0,0 = top-left (web standard).
|
|
45
|
+
/// Re-expresses the vertical position of a box in a top-left-origin, unit-height space.
///
/// - Parameters:
///   - y: Lower edge of the box, measured upward from the bottom.
///   - h: Height of the box.
/// - Returns: `1.0 - y - h`, the upper edge of the box measured downward from the top.
func flipY(_ y: Double, _ h: Double) -> Double {
    // Subtract in two steps, left to right, so the floating-point result is
    // bit-for-bit the same as the single expression `1.0 - y - h`.
    let aboveLowerEdge = 1.0 - y
    return aboveLowerEdge - h
}
|
|
46
|
+
|
|
47
|
+
/// Converts a point into a two-element `[x, y]` array with the vertical axis flipped.
///
/// - Parameter p: Point whose `y` is measured upward from the bottom of a unit-height space.
/// - Returns: `[x, 1.0 - y]`.
func pt(_ p: CGPoint) -> [Double] {
    let horizontal = Double(p.x)
    let flippedVertical = 1.0 - Double(p.y)
    return [horizontal, flippedVertical]
}
|
|
48
|
+
|
|
49
|
+
/// Serializes `value` to a JSON string using a default `JSONEncoder`.
///
/// - Parameter value: Any `Encodable` value (in this file, an array of result structs).
/// - Returns: The JSON text, or the literal `"[]"` when encoding fails.
///
/// An encoding failure is reported on stderr in the same `ERROR:` style as the
/// other failure paths in this file, instead of being swallowed silently, so a
/// failed encode can be told apart from a genuinely empty result. The fallback
/// return value is unchanged.
func encodeJSON<T: Encodable>(_ value: T) -> String {
    do {
        let data = try JSONEncoder().encode(value)
        // Decoding the encoder's output as UTF-8 this way cannot fail, so no
        // second fallback branch is needed.
        return String(decoding: data, as: UTF8.self)
    } catch {
        fputs("ERROR: JSON encoding failed: \(error.localizedDescription)\n", stderr)
        return "[]"
    }
}
|
|
54
|
+
|
|
55
|
+
// ─── Argument parsing ─────────────────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
// Raw process arguments; element 0 is the executable path.
let args = CommandLine.arguments

// Mode switches. `--json` selects OCR with structured output; each of the other
// flags selects the detector handled by the `if` block of the same name below.
let isJsonMode   = args.contains("--json")
let isFaces      = args.contains("--faces")
let isBarcodes   = args.contains("--barcodes")
let isRectangles = args.contains("--rectangles")
let isDocument   = args.contains("--document")
let isClassify   = args.contains("--classify")

// Positional arguments: everything after the executable path that is not a
// `--flag`. The executable is skipped by position rather than by matching the
// text "vision-helper", so an image path that happens to contain that text
// (for example a directory named `vision-helper-tests`) is no longer discarded.
let fileArgs = args.dropFirst().filter { !$0.hasPrefix("--") }
|
|
66
|
+
|
|
67
|
+
// The first positional argument is the file to analyse. Without one there is
// nothing to do: print the usage line on stderr and exit non-zero, like every
// other failure path in this file, so a caller cannot mistake the usage text
// on stdout with status 0 for a successful result.
guard let imagePath = fileArgs.first else {
    fputs("Usage: vision-helper [--json|--faces|--barcodes|--rectangles|--document|--classify] <path>\n", stderr)
    exit(1)
}
|
|
71
|
+
|
|
72
|
+
// Load the file through AppKit and pull a CGImage out of it; if either step
// fails, report the path on stderr and stop.
let imageURL = URL(fileURLWithPath: imagePath)
guard
    let image = NSImage(contentsOf: imageURL),
    let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil)
else {
    fputs("ERROR: Cannot open file: \(imagePath)\n", stderr)
    exit(1)
}

// Single request handler shared by whichever mode block runs below.
let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
|
|
79
|
+
|
|
80
|
+
// ─── OCR (default + --json) ───────────────────────────────────────────────────
|
|
81
|
+
|
|
82
|
+
// OCR runs when `--json` is given, and also as the default when none of the
// detector flags is present.
if isJsonMode || !(isFaces || isBarcodes || isRectangles || isDocument || isClassify) {
    var ocrResults: [OCRResult] = []
    var rawText = ""

    let request = VNRecognizeTextRequest { finished, _ in
        guard let observations = finished.results as? [VNRecognizedTextObservation] else { return }

        // Pair each observation's box with its best candidate; observations
        // that have no candidate are skipped.
        let recognized: [(CGRect, VNRecognizedText)] = observations.compactMap { observation in
            guard let best = observation.topCandidates(1).first else { return nil }
            return (observation.boundingBox, best)
        }

        if isJsonMode {
            // Structured output: one OCRResult per recognized piece of text.
            ocrResults += recognized.map { entry -> OCRResult in
                let (rect, candidate) = entry
                return OCRResult(
                    t: candidate.string,
                    x: Double(rect.origin.x),
                    y: flipY(Double(rect.origin.y), Double(rect.size.height)),
                    w: Double(rect.size.width),
                    h: Double(rect.size.height),
                    confidence: candidate.confidence
                )
            }
        } else {
            // Plain output: every recognized string on its own line.
            rawText += recognized.map { $0.1.string + "\n" }.joined()
        }
    }
    request.recognitionLevel = .accurate

    do {
        try handler.perform([request])
    } catch {
        fputs("ERROR: Vision OCR failed: \(error.localizedDescription)\n", stderr)
        exit(1)
    }

    let output = isJsonMode
        ? encodeJSON(ocrResults)
        : rawText.trimmingCharacters(in: .whitespacesAndNewlines)
    print(output)
    exit(0)
}
|
|
116
|
+
|
|
117
|
+
// ─── Faces ───────────────────────────────────────────────────────────────────
|
|
118
|
+
|
|
119
|
+
// `--faces`: detect face rectangles and print them as a JSON array.
if isFaces {
    var results: [FaceResult] = []

    let request = VNDetectFaceRectanglesRequest { finished, _ in
        guard let faces = finished.results as? [VNFaceObservation] else { return }
        results += faces.map { face -> FaceResult in
            let rect = face.boundingBox
            return FaceResult(
                x: Double(rect.origin.x),
                y: flipY(Double(rect.origin.y), Double(rect.size.height)),
                w: Double(rect.size.width),
                h: Double(rect.size.height),
                confidence: face.confidence
            )
        }
    }

    do {
        try handler.perform([request])
    } catch {
        fputs("ERROR: Vision face detection failed: \(error.localizedDescription)\n", stderr)
        exit(1)
    }

    let output = encodeJSON(results)
    print(output)
    exit(0)
}
|
|
143
|
+
|
|
144
|
+
// ─── Barcodes ────────────────────────────────────────────────────────────────
|
|
145
|
+
|
|
146
|
+
// `--barcodes`: detect barcodes and print them as a JSON array.
if isBarcodes {
    var results: [BarcodeResult] = []

    let request = VNDetectBarcodesRequest { finished, _ in
        guard let barcodes = finished.results as? [VNBarcodeObservation] else { return }
        results += barcodes.map { barcode -> BarcodeResult in
            let rect = barcode.boundingBox
            return BarcodeResult(
                type: barcode.symbology.rawValue,
                // A barcode without a string payload is reported with an empty value.
                value: barcode.payloadStringValue ?? "",
                x: Double(rect.origin.x),
                y: flipY(Double(rect.origin.y), Double(rect.size.height)),
                w: Double(rect.size.width),
                h: Double(rect.size.height),
                confidence: barcode.confidence
            )
        }
    }

    do {
        try handler.perform([request])
    } catch {
        fputs("ERROR: Vision barcode detection failed: \(error.localizedDescription)\n", stderr)
        exit(1)
    }

    let output = encodeJSON(results)
    print(output)
    exit(0)
}
|
|
172
|
+
|
|
173
|
+
// ─── Rectangles ──────────────────────────────────────────────────────────────
|
|
174
|
+
|
|
175
|
+
// `--rectangles`: detect rectangles and print their corners as a JSON array.
if isRectangles {
    var results: [RectangleResult] = []

    let request = VNDetectRectanglesRequest { finished, _ in
        guard let rectangles = finished.results as? [VNRectangleObservation] else { return }
        results += rectangles.map { rectangle -> RectangleResult in
            RectangleResult(
                topLeft: pt(rectangle.topLeft),
                topRight: pt(rectangle.topRight),
                bottomLeft: pt(rectangle.bottomLeft),
                bottomRight: pt(rectangle.bottomRight),
                confidence: rectangle.confidence
            )
        }
    }
    // NOTE(review): per the Vision documentation, 0 removes the cap on the
    // number of rectangles returned — confirm against the SDK in use.
    request.maximumObservations = 0

    do {
        try handler.perform([request])
    } catch {
        fputs("ERROR: Vision rectangle detection failed: \(error.localizedDescription)\n", stderr)
        exit(1)
    }

    let output = encodeJSON(results)
    print(output)
    exit(0)
}
|
|
197
|
+
|
|
198
|
+
// ─── Document ────────────────────────────────────────────────────────────────
|
|
199
|
+
|
|
200
|
+
// `--document`: run document segmentation and print the detected outline(s)
// as a JSON array of corner sets.
if isDocument {
    var results: [DocumentResult] = []

    let request = VNDetectDocumentSegmentationRequest { finished, _ in
        guard let outlines = finished.results as? [VNRectangleObservation] else { return }
        results += outlines.map { outline -> DocumentResult in
            DocumentResult(
                topLeft: pt(outline.topLeft),
                topRight: pt(outline.topRight),
                bottomLeft: pt(outline.bottomLeft),
                bottomRight: pt(outline.bottomRight),
                confidence: outline.confidence
            )
        }
    }

    do {
        try handler.perform([request])
    } catch {
        fputs("ERROR: Vision document detection failed: \(error.localizedDescription)\n", stderr)
        exit(1)
    }

    let output = encodeJSON(results)
    print(output)
    exit(0)
}
|
|
221
|
+
|
|
222
|
+
// ─── Classify ────────────────────────────────────────────────────────────────
|
|
223
|
+
|
|
224
|
+
// `--classify`: classify the image and print the retained labels as a JSON array.
if isClassify {
    var results: [ClassificationResult] = []

    let request = VNClassifyImageRequest { finished, _ in
        guard let labels = finished.results as? [VNClassificationObservation] else { return }
        // Keep labels whose confidence exceeds 0.01, then at most the first ten
        // of those, in the order the request delivered them.
        let confident = labels.filter { $0.confidence > 0.01 }
        results += confident.prefix(10).map { label -> ClassificationResult in
            ClassificationResult(identifier: label.identifier, confidence: label.confidence)
        }
    }

    do {
        try handler.perform([request])
    } catch {
        fputs("ERROR: Vision classification failed: \(error.localizedDescription)\n", stderr)
        exit(1)
    }

    let output = encodeJSON(results)
    print(output)
    exit(0)
}
|
package/.husky/commit-msg
DELETED
package/.husky/pre-commit
DELETED
package/.prettierignore
DELETED
package/.prettierrc.json
DELETED
package/.release-it.json
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "https://unpkg.com/release-it/schema/release-it.json",
|
|
3
|
-
"plugins": {
|
|
4
|
-
"@release-it/conventional-changelog": {
|
|
5
|
-
"preset": "conventionalcommits",
|
|
6
|
-
"infile": "CHANGELOG.md"
|
|
7
|
-
}
|
|
8
|
-
},
|
|
9
|
-
"git": {
|
|
10
|
-
"commitMessage": "chore(release): v${version}",
|
|
11
|
-
"tagName": "v${version}"
|
|
12
|
-
},
|
|
13
|
-
"github": {
|
|
14
|
-
"release": false
|
|
15
|
-
},
|
|
16
|
-
"npm": {
|
|
17
|
-
"publish": true,
|
|
18
|
-
"tag": "latest"
|
|
19
|
-
}
|
|
20
|
-
}
|
package/CHANGELOG.md
DELETED
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
# Changelog
|
|
2
|
-
|
|
3
|
-
## [1.2.0](https://github.com/woladi/macos-vision/compare/v1.1.0...v1.2.0) (2026-04-09)
|
|
4
|
-
|
|
5
|
-
### Features
|
|
6
|
-
|
|
7
|
-
* replace sips with PDFKit-based pdf-helper binary for PDF rasterization ([4a223e2](https://github.com/woladi/macos-vision/commit/4a223e2de79571794d866452fd5e87b84590ff0d))
|
|
8
|
-
|
|
9
|
-
## [1.1.0](https://github.com/woladi/macos-vision/compare/v1.0.3...v1.1.0) (2026-04-09)
|
|
10
|
-
|
|
11
|
-
### Features
|
|
12
|
-
|
|
13
|
-
* add PDF support via sips rasterization ([a48bf17](https://github.com/woladi/macos-vision/commit/a48bf17579a6df11aed6eadbde4fa5041ccaa981))
|
|
14
|
-
|
|
15
|
-
## [1.0.3](https://github.com/woladi/macos-vision/compare/v1.0.2...v1.0.3) (2026-04-08)
|
|
16
|
-
|
|
17
|
-
### Reverts
|
|
18
|
-
|
|
19
|
-
* remove socket.ignore field — worsens supply chain risk score ([a1827ad](https://github.com/woladi/macos-vision/commit/a1827ad489220ebb7a2e8c85632945fe969438db))
|
|
20
|
-
|
|
21
|
-
## [1.0.2](https://github.com/woladi/macos-vision/compare/v1.0.1...v1.0.2) (2026-04-08)
|
|
22
|
-
|
|
23
|
-
## [1.0.1](https://github.com/woladi/macos-vision/compare/v0.3.1...v1.0.1) (2026-04-08)
|
|
24
|
-
|
|
25
|
-
## [0.3.1](https://github.com/woladi/macos-vision/compare/v0.3.0...v0.3.1) (2026-04-08)
|
|
26
|
-
|
|
27
|
-
## [0.3.0](https://github.com/woladi/macos-vision/compare/v0.2.0...v0.3.0) (2026-04-08)
|
|
28
|
-
|
|
29
|
-
### Features
|
|
30
|
-
|
|
31
|
-
* add inferLayout() — unified reading-order LayoutBlock representation ([aec507e](https://github.com/woladi/macos-vision/commit/aec507eb7cf133ec1e56759c0945563a48d871ee))
|
|
32
|
-
|
|
33
|
-
## [0.2.0](https://github.com/woladi/macos-vision/compare/v0.1.4...v0.2.0) (2026-04-08)
|
|
34
|
-
|
|
35
|
-
### Features
|
|
36
|
-
|
|
37
|
-
* add confidence to VisionBlock and Barcode ([a87df27](https://github.com/woladi/macos-vision/commit/a87df275e51dec4b57fbff6e3bffc4220b96b4d7))
|
|
38
|
-
|
|
39
|
-
### Bug Fixes
|
|
40
|
-
|
|
41
|
-
* correct mkdirSync, CLI error on missing file, execFile timeout, README scope ([1cef2c7](https://github.com/woladi/macos-vision/commit/1cef2c7078430c9182fcd39792cf0c002833203f))
|
|
42
|
-
* replace try? with do/catch in Swift helper — surface Vision errors properly ([f287065](https://github.com/woladi/macos-vision/commit/f2870655225806070be3db462ea15923201fecbf))
|
|
43
|
-
|
|
44
|
-
## 0.1.4 (2026-04-08)
|
package/commitlint.config.js
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export default { extends: ['@commitlint/config-conventional'] };
|
package/debug.js
DELETED
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
ocr,
|
|
4
|
-
detectFaces,
|
|
5
|
-
detectBarcodes,
|
|
6
|
-
detectRectangles,
|
|
7
|
-
detectDocument,
|
|
8
|
-
classify,
|
|
9
|
-
inferLayout,
|
|
10
|
-
} from './dist/index.js';
|
|
11
|
-
|
|
12
|
-
const imagePath = process.argv[2] || './test/fixtures/sample.png';
|
|
13
|
-
console.log(`\n📸 Analyzing: ${imagePath}\n`);
|
|
14
|
-
|
|
15
|
-
const [text, blocks, faces, barcodes, rects, doc, labels] = await Promise.all([
|
|
16
|
-
ocr(imagePath),
|
|
17
|
-
ocr(imagePath, { format: 'blocks' }),
|
|
18
|
-
detectFaces(imagePath),
|
|
19
|
-
detectBarcodes(imagePath),
|
|
20
|
-
detectRectangles(imagePath),
|
|
21
|
-
detectDocument(imagePath),
|
|
22
|
-
classify(imagePath),
|
|
23
|
-
]);
|
|
24
|
-
|
|
25
|
-
const sep = (title) => console.log('\n' + '─'.repeat(60) + '\n' + title + '\n');
|
|
26
|
-
|
|
27
|
-
sep('📝 OCR text'); console.log(text);
|
|
28
|
-
sep('📝 OCR blocks'); console.log(JSON.stringify(blocks, null, 2));
|
|
29
|
-
sep('👤 Faces'); console.log(JSON.stringify(faces, null, 2));
|
|
30
|
-
sep('🔲 Barcodes'); console.log(JSON.stringify(barcodes, null, 2));
|
|
31
|
-
sep('📦 Rectangles'); console.log(JSON.stringify(rects, null, 2));
|
|
32
|
-
sep('📄 Document'); console.log(JSON.stringify(doc, null, 2));
|
|
33
|
-
sep('🏷️ Classification'); console.log(JSON.stringify(labels, null, 2));
|
|
34
|
-
|
|
35
|
-
const layout = inferLayout({ textBlocks: blocks, faces, barcodes, rectangles: rects, document: doc });
|
|
36
|
-
sep('🗂️ Layout (reading order)'); console.log(JSON.stringify(layout, null, 2));
|
|
37
|
-
console.log('\n' + '─'.repeat(60) + '\n');
|
package/eslint.config.js
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import tseslint from 'typescript-eslint';
|
|
2
|
-
import prettier from 'eslint-config-prettier';
|
|
3
|
-
|
|
4
|
-
export default tseslint.config(
|
|
5
|
-
...tseslint.configs.recommended,
|
|
6
|
-
prettier,
|
|
7
|
-
{
|
|
8
|
-
files: ['src/**/*.ts'],
|
|
9
|
-
languageOptions: {
|
|
10
|
-
parser: tseslint.parser,
|
|
11
|
-
parserOptions: {
|
|
12
|
-
project: true, // Szuka najbliższego tsconfig.json
|
|
13
|
-
},
|
|
14
|
-
},
|
|
15
|
-
rules: {
|
|
16
|
-
'@typescript-eslint/no-explicit-any': 'warn',
|
|
17
|
-
'@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
|
|
18
|
-
},
|
|
19
|
-
},
|
|
20
|
-
{ ignores: ['dist/**', 'node_modules/**', 'bin/**'] }
|
|
21
|
-
);
|