macos-vision 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ import PDFKit
2
+ import AppKit
3
+ import Foundation
4
+
5
+ // ─── Result struct ────────────────────────────────────────────────────────────
6
+
7
/// One rasterized PDF page, as reported in the helper's JSON output.
struct PageResult: Codable {
    /// Zero-based index of the page within the source PDF.
    let page: Int
    /// Path of the PNG file written for this page.
    let path: String
}
11
+
12
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
13
+
14
/// Reports a fatal error on standard error and terminates the process.
///
/// - Parameter message: Human-readable description; emitted as `ERROR: <message>`
///   followed by a newline.
/// - Returns: Never; the process exits with status 1.
func fail(_ message: String) -> Never {
    let line = "ERROR: \(message)\n"
    fputs(line, stderr)
    exit(EXIT_FAILURE)
}
18
+
19
/// Serializes `value` to a compact JSON string.
///
/// - Parameter value: Any `Encodable` value.
/// - Returns: The JSON text, or the literal `"[]"` when encoding or UTF-8
///   decoding fails.
func encodeJSON<T: Encodable>(_ value: T) -> String {
    let fallback = "[]"
    do {
        let payload = try JSONEncoder().encode(value)
        return String(data: payload, encoding: .utf8) ?? fallback
    } catch {
        return fallback
    }
}
24
+
25
+ // ─── Argument parsing ─────────────────────────────────────────────────────────
26
+
27
// Raw process arguments: index 0 is the executable, index 1 the PDF path.
let args = CommandLine.arguments
if args.count < 2 {
    fail("Usage: pdf-helper <path-to-pdf>")
}

let pdfPath = args[1]
let pdfURL = URL(fileURLWithPath: pdfPath)

// Bail out if PDFKit cannot load a document from the given path.
guard let pdf = PDFDocument(url: pdfURL) else {
    fail("Cannot open PDF: \(pdfPath)")
}

// A document without pages has nothing to rasterize.
let pageCount = pdf.pageCount
if pageCount <= 0 {
    fail("PDF has no pages: \(pdfPath)")
}
43
+
44
+ // ─── Output directory: ~/.cache/macos-vision/{basename}-{uuid}/ ───────────────
45
+
46
// File name of the PDF without its extension; used for the directory and image names.
let basename = pdfURL.deletingPathExtension().lastPathComponent
// A fresh lowercase UUID keeps repeated runs on the same PDF from colliding.
let uuid = UUID().uuidString.lowercased()
let cacheBase = FileManager.default.homeDirectoryForCurrentUser
    .appendingPathComponent(".cache/macos-vision")
let outDir = cacheBase.appendingPathComponent("\(basename)-\(uuid)")

// Create the directory (and any missing parents); abort if that is not possible.
do {
    try FileManager.default.createDirectory(
        at: outDir,
        withIntermediateDirectories: true
    )
} catch let creationError {
    fail("Cannot create output directory \(outDir.path): \(creationError.localizedDescription)")
}
57
+
58
+ // ─── Rasterize each page at 300 DPI ──────────────────────────────────────────
59
+
60
+ // PDF points are 72 pt/inch. Scale factor for 300 DPI = 300/72 ≈ 4.167
61
// 300 DPI expressed in PDF points (72 per inch): 300 / 72 ≈ 4.167 pixels per point.
let scale: CGFloat = 300.0 / 72.0

// One entry per page image written, in page order.
var results: [PageResult] = []

for pageIndex in 0..<pageCount {
    // A 300 DPI page bitmap plus its PNG encoding can be tens of megabytes. A
    // command-line tool has no run loop draining the autorelease pool, so drain it
    // once per page to keep peak memory from growing with the page count.
    autoreleasepool {
        guard let page = pdf.page(at: pageIndex) else {
            fail("Cannot access page \(pageIndex) of \(pdfPath)")
        }

        // Pixel size of the page's media box at the target resolution.
        // NOTE(review): the media box is used as reported; pages with a non-zero
        // rotation may need width/height swapped — confirm against PDFKit's
        // drawing behavior.
        let mediaBox = page.bounds(for: .mediaBox)
        let width = Int((mediaBox.width * scale).rounded())
        let height = Int((mediaBox.height * scale).rounded())

        // 8-bit RGBA, non-planar; bytesPerRow/bitsPerPixel of 0 let AppKit pick them.
        guard let bitmapRep = NSBitmapImageRep(
            bitmapDataPlanes: nil,
            pixelsWide: width,
            pixelsHigh: height,
            bitsPerSample: 8,
            samplesPerPixel: 4,
            hasAlpha: true,
            isPlanar: false,
            colorSpaceName: .calibratedRGB,
            bytesPerRow: 0,
            bitsPerPixel: 0
        ) else {
            fail("Cannot create bitmap for page \(pageIndex)")
        }

        guard let ctx = NSGraphicsContext(bitmapImageRep: bitmapRep) else {
            fail("Cannot create graphics context for page \(pageIndex)")
        }

        // Fill white background (PDFs are transparent by default)
        NSGraphicsContext.saveGraphicsState()
        NSGraphicsContext.current = ctx
        NSColor.white.setFill()
        NSRect(x: 0, y: 0, width: width, height: height).fill()

        // Scale from PDF points to pixels, then render the page into the bitmap.
        ctx.cgContext.scaleBy(x: scale, y: scale)
        page.draw(with: .mediaBox, to: ctx.cgContext)
        NSGraphicsContext.restoreGraphicsState()

        guard let pngData = bitmapRep.representation(using: .png, properties: [:]) else {
            fail("Cannot encode page \(pageIndex) to PNG")
        }

        // 1-based page number, zero-padded to 3 digits and prefixed with the PDF's
        // basename: <basename>-page-001.png, <basename>-page-002.png, …
        let filename = String(format: "%@-page-%03d.png", basename, pageIndex + 1)
        let outPath = outDir.appendingPathComponent(filename)

        do {
            try pngData.write(to: outPath)
        } catch {
            fail("Cannot write \(outPath.path): \(error.localizedDescription)")
        }

        // The JSON result keeps the 0-based index even though the file name is 1-based.
        results.append(PageResult(page: pageIndex, path: outPath.path))
    }
}

// ─── Output JSON ──────────────────────────────────────────────────────────────

print(encodeJSON(results))
@@ -0,0 +1,241 @@
1
+ import Vision
2
+ import AppKit
3
+ import Foundation
4
+
5
+ // ─── Result structs ──────────────────────────────────────────────────────────
6
+
7
/// One recognized text line with its bounding box and confidence.
struct OCRResult: Codable {
    /// Recognized text.
    let t: String
    /// Bounding-box origin (x, y) and size (w, h).
    let x: Double
    let y: Double
    let w: Double
    let h: Double
    /// Confidence of the recognized candidate.
    let confidence: Float
}
12
+
13
/// One detected face: bounding box plus detection confidence.
struct FaceResult: Codable {
    /// Bounding-box origin (x, y) and size (w, h).
    let x: Double
    let y: Double
    let w: Double
    let h: Double
    /// Confidence reported for the observation.
    let confidence: Float
}
17
+
18
/// One detected barcode: symbology, decoded payload, bounding box and confidence.
struct BarcodeResult: Codable {
    /// Symbology identifier string.
    let type: String
    /// Decoded payload text.
    let value: String
    /// Bounding-box origin (x, y) and size (w, h).
    let x: Double
    let y: Double
    let w: Double
    let h: Double
    /// Confidence reported for the observation.
    let confidence: Float
}
24
+
25
/// One detected rectangle, described by its four corners.
struct RectangleResult: Codable {
    /// Corner points, each stored as an array of coordinates.
    let topLeft: [Double]
    let topRight: [Double]
    let bottomLeft: [Double]
    let bottomRight: [Double]
    /// Confidence reported for the observation.
    let confidence: Float
}
30
+
31
/// One detected document outline, described by its four corners.
struct DocumentResult: Codable {
    /// Corner points, each stored as an array of coordinates.
    let topLeft: [Double]
    let topRight: [Double]
    let bottomLeft: [Double]
    let bottomRight: [Double]
    /// Confidence reported for the observation.
    let confidence: Float
}
36
+
37
/// One image classification label with its confidence.
struct ClassificationResult: Codable {
    /// Label identifier.
    let identifier: String
    /// Confidence reported for the label.
    let confidence: Float
}
41
+
42
+ // ─── Helpers ─────────────────────────────────────────────────────────────────
43
+
44
+ // Vision: 0,0 = bottom-left. We flip Y so 0,0 = top-left (web standard).
45
/// Converts the bottom-origin `y` of a normalized box into a top-origin `y`.
///
/// - Parameters:
///   - y: Lower edge of the box, measured from the bottom (0...1).
///   - h: Height of the box (0...1).
/// - Returns: Upper edge of the box, measured from the top.
func flipY(_ y: Double, _ h: Double) -> Double {
    let bottomEdgeFromTop = 1.0 - y
    return bottomEdgeFromTop - h
}
46
+
47
/// Converts a normalized point into an `[x, y]` pair with the Y axis flipped
/// (`y` becomes `1 - y`).
func pt(_ p: CGPoint) -> [Double] {
    let x = Double(p.x)
    let flippedY = 1.0 - Double(p.y)
    return [x, flippedY]
}
48
+
49
/// Serializes `value` to a compact JSON string.
///
/// - Parameter value: Any `Encodable` value.
/// - Returns: The JSON text, or the literal `"[]"` when encoding or UTF-8
///   decoding fails.
func encodeJSON<T: Encodable>(_ value: T) -> String {
    let fallback = "[]"
    guard let encoded = try? JSONEncoder().encode(value) else {
        return fallback
    }
    return String(data: encoded, encoding: .utf8) ?? fallback
}
54
+
55
+ // ─── Argument parsing ─────────────────────────────────────────────────────────
56
+
57
// Mode flags: each is true when the corresponding `--flag` appears anywhere on the
// command line.
let args = CommandLine.arguments
let isJsonMode   = args.contains("--json")
let isFaces      = args.contains("--faces")
let isBarcodes   = args.contains("--barcodes")
let isRectangles = args.contains("--rectangles")
let isDocument   = args.contains("--document")
let isClassify   = args.contains("--classify")

// Positional arguments: everything after the executable path (argv[0]) that is not
// a `--flag`. argv[0] is skipped by position rather than by matching the name
// "vision-helper", so an image whose path happens to contain that text is not
// discarded, and a renamed binary is never mistaken for the input file.
let fileArgs = args.dropFirst().filter { !$0.hasPrefix("--") }

// Without an input path, print usage and exit successfully.
guard let imagePath = fileArgs.first else {
    print("Usage: vision-helper [--json|--faces|--barcodes|--rectangles|--document|--classify] <path>")
    exit(0)
}

// Load the image and obtain a CGImage for Vision; either step failing is fatal.
guard let image = NSImage(contentsOf: URL(fileURLWithPath: imagePath)),
      let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
    fputs("ERROR: Cannot open file: \(imagePath)\n", stderr)
    exit(1)
}

// Request handler used by whichever mode runs below.
let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
79
+
80
+ // ─── OCR (default + --json) ───────────────────────────────────────────────────
81
+
82
// OCR runs when --json is given, or when no other detector flag is present.
if isJsonMode || !(isFaces || isBarcodes || isRectangles || isDocument || isClassify) {
    var ocrResults: [OCRResult] = []
    var rawText = ""

    let request = VNRecognizeTextRequest { finished, _ in
        guard let observations = finished.results as? [VNRecognizedTextObservation] else { return }
        for observation in observations {
            // Only the single best candidate per observation is reported.
            guard let best = observation.topCandidates(1).first else { continue }
            let frame = observation.boundingBox

            // Plain-text mode only accumulates the strings, one per line.
            guard isJsonMode else {
                rawText += best.string + "\n"
                continue
            }

            let block = OCRResult(
                t: best.string,
                x: Double(frame.origin.x),
                y: flipY(Double(frame.origin.y), Double(frame.size.height)),
                w: Double(frame.size.width),
                h: Double(frame.size.height),
                confidence: best.confidence
            )
            ocrResults.append(block)
        }
    }
    request.recognitionLevel = .accurate

    do {
        try handler.perform([request])
    } catch let visionError {
        fputs("ERROR: Vision OCR failed: \(visionError.localizedDescription)\n", stderr)
        exit(1)
    }

    if isJsonMode {
        print(encodeJSON(ocrResults))
    } else {
        print(rawText.trimmingCharacters(in: .whitespacesAndNewlines))
    }
    exit(0)
}
116
+
117
+ // ─── Faces ───────────────────────────────────────────────────────────────────
118
+
119
// --faces: detect face rectangles and print them as a JSON array.
if isFaces {
    var results: [FaceResult] = []

    let request = VNDetectFaceRectanglesRequest { finished, _ in
        guard let faces = finished.results as? [VNFaceObservation] else { return }
        results += faces.map { face -> FaceResult in
            let frame = face.boundingBox
            return FaceResult(
                x: Double(frame.origin.x),
                y: flipY(Double(frame.origin.y), Double(frame.size.height)),
                w: Double(frame.size.width),
                h: Double(frame.size.height),
                confidence: face.confidence
            )
        }
    }

    do {
        try handler.perform([request])
    } catch let visionError {
        fputs("ERROR: Vision face detection failed: \(visionError.localizedDescription)\n", stderr)
        exit(1)
    }

    print(encodeJSON(results))
    exit(0)
}
143
+
144
+ // ─── Barcodes ────────────────────────────────────────────────────────────────
145
+
146
// --barcodes: detect barcodes and print them as a JSON array.
if isBarcodes {
    var results: [BarcodeResult] = []

    let request = VNDetectBarcodesRequest { finished, _ in
        guard let codes = finished.results as? [VNBarcodeObservation] else { return }
        results += codes.map { code -> BarcodeResult in
            let frame = code.boundingBox
            return BarcodeResult(
                type: code.symbology.rawValue,
                // A barcode without a string payload is reported with an empty value.
                value: code.payloadStringValue ?? "",
                x: Double(frame.origin.x),
                y: flipY(Double(frame.origin.y), Double(frame.size.height)),
                w: Double(frame.size.width),
                h: Double(frame.size.height),
                confidence: code.confidence
            )
        }
    }

    do {
        try handler.perform([request])
    } catch let visionError {
        fputs("ERROR: Vision barcode detection failed: \(visionError.localizedDescription)\n", stderr)
        exit(1)
    }

    print(encodeJSON(results))
    exit(0)
}
172
+
173
+ // ─── Rectangles ──────────────────────────────────────────────────────────────
174
+
175
// --rectangles: detect rectangles and print their corners as a JSON array.
if isRectangles {
    var results: [RectangleResult] = []

    let request = VNDetectRectanglesRequest { finished, _ in
        guard let rects = finished.results as? [VNRectangleObservation] else { return }
        results += rects.map { rect in
            RectangleResult(
                topLeft: pt(rect.topLeft),
                topRight: pt(rect.topRight),
                bottomLeft: pt(rect.bottomLeft),
                bottomRight: pt(rect.bottomRight),
                confidence: rect.confidence
            )
        }
    }
    // NOTE(review): 0 is presumably meant as "no limit" — confirm against the Vision docs.
    request.maximumObservations = 0

    do {
        try handler.perform([request])
    } catch let visionError {
        fputs("ERROR: Vision rectangle detection failed: \(visionError.localizedDescription)\n", stderr)
        exit(1)
    }

    print(encodeJSON(results))
    exit(0)
}
197
+
198
+ // ─── Document ────────────────────────────────────────────────────────────────
199
+
200
// --document: run document segmentation and print the outline corners as a JSON array.
if isDocument {
    var results: [DocumentResult] = []

    let request = VNDetectDocumentSegmentationRequest { finished, _ in
        guard let outlines = finished.results as? [VNRectangleObservation] else { return }
        results += outlines.map { outline in
            DocumentResult(
                topLeft: pt(outline.topLeft),
                topRight: pt(outline.topRight),
                bottomLeft: pt(outline.bottomLeft),
                bottomRight: pt(outline.bottomRight),
                confidence: outline.confidence
            )
        }
    }

    do {
        try handler.perform([request])
    } catch let visionError {
        fputs("ERROR: Vision document detection failed: \(visionError.localizedDescription)\n", stderr)
        exit(1)
    }

    print(encodeJSON(results))
    exit(0)
}
221
+
222
+ // ─── Classify ────────────────────────────────────────────────────────────────
223
+
224
// --classify: classify the image and print up to ten labels as a JSON array.
if isClassify {
    var results: [ClassificationResult] = []

    let request = VNClassifyImageRequest { finished, _ in
        guard let labels = finished.results as? [VNClassificationObservation] else { return }
        // Keep labels with confidence above 0.01, then take the first ten of those.
        let confident = labels.filter { $0.confidence > 0.01 }
        results += confident.prefix(10).map { label in
            ClassificationResult(identifier: label.identifier, confidence: label.confidence)
        }
    }

    do {
        try handler.perform([request])
    } catch let visionError {
        fputs("ERROR: Vision classification failed: \(visionError.localizedDescription)\n", stderr)
        exit(1)
    }

    print(encodeJSON(results))
    exit(0)
}
package/.husky/commit-msg DELETED
@@ -1,2 +0,0 @@
1
- #!/bin/sh
2
- npx --no -- commitlint --edit $1
package/.husky/pre-commit DELETED
@@ -1,3 +0,0 @@
1
- #!/bin/sh
2
- npx lint-staged
3
- npx tsc --noEmit
package/.prettierignore DELETED
@@ -1,4 +0,0 @@
1
- dist/
2
- node_modules/
3
- bin/
4
- *.md
package/.prettierrc.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "semi": true,
3
- "singleQuote": true,
4
- "trailingComma": "es5",
5
- "printWidth": 100,
6
- "tabWidth": 2
7
- }
package/.release-it.json DELETED
@@ -1,20 +0,0 @@
1
- {
2
- "$schema": "https://unpkg.com/release-it/schema/release-it.json",
3
- "plugins": {
4
- "@release-it/conventional-changelog": {
5
- "preset": "conventionalcommits",
6
- "infile": "CHANGELOG.md"
7
- }
8
- },
9
- "git": {
10
- "commitMessage": "chore(release): v${version}",
11
- "tagName": "v${version}"
12
- },
13
- "github": {
14
- "release": false
15
- },
16
- "npm": {
17
- "publish": true,
18
- "tag": "latest"
19
- }
20
- }
package/CHANGELOG.md DELETED
@@ -1,38 +0,0 @@
1
- # Changelog
2
-
3
- ## [1.1.0](https://github.com/woladi/macos-vision/compare/v1.0.3...v1.1.0) (2026-04-09)
4
-
5
- ### Features
6
-
7
- * add PDF support via sips rasterization ([a48bf17](https://github.com/woladi/macos-vision/commit/a48bf17579a6df11aed6eadbde4fa5041ccaa981))
8
-
9
- ## [1.0.3](https://github.com/woladi/macos-vision/compare/v1.0.2...v1.0.3) (2026-04-08)
10
-
11
- ### Reverts
12
-
13
- * remove socket.ignore field — worsens supply chain risk score ([a1827ad](https://github.com/woladi/macos-vision/commit/a1827ad489220ebb7a2e8c85632945fe969438db))
14
-
15
- ## [1.0.2](https://github.com/woladi/macos-vision/compare/v1.0.1...v1.0.2) (2026-04-08)
16
-
17
- ## [1.0.1](https://github.com/woladi/macos-vision/compare/v0.3.1...v1.0.1) (2026-04-08)
18
-
19
- ## [0.3.1](https://github.com/woladi/macos-vision/compare/v0.3.0...v0.3.1) (2026-04-08)
20
-
21
- ## [0.3.0](https://github.com/woladi/macos-vision/compare/v0.2.0...v0.3.0) (2026-04-08)
22
-
23
- ### Features
24
-
25
- * add inferLayout() — unified reading-order LayoutBlock representation ([aec507e](https://github.com/woladi/macos-vision/commit/aec507eb7cf133ec1e56759c0945563a48d871ee))
26
-
27
- ## [0.2.0](https://github.com/woladi/macos-vision/compare/v0.1.4...v0.2.0) (2026-04-08)
28
-
29
- ### Features
30
-
31
- * add confidence to VisionBlock and Barcode ([a87df27](https://github.com/woladi/macos-vision/commit/a87df275e51dec4b57fbff6e3bffc4220b96b4d7))
32
-
33
- ### Bug Fixes
34
-
35
- * correct mkdirSync, CLI error on missing file, execFile timeout, README scope ([1cef2c7](https://github.com/woladi/macos-vision/commit/1cef2c7078430c9182fcd39792cf0c002833203f))
36
- * replace try? with do/catch in Swift helper — surface Vision errors properly ([f287065](https://github.com/woladi/macos-vision/commit/f2870655225806070be3db462ea15923201fecbf))
37
-
38
- ## 0.1.4 (2026-04-08)
@@ -1 +0,0 @@
1
- export default { extends: ['@commitlint/config-conventional'] };
package/debug.js DELETED
@@ -1,37 +0,0 @@
1
- #!/usr/bin/env node
2
- import {
3
- ocr,
4
- detectFaces,
5
- detectBarcodes,
6
- detectRectangles,
7
- detectDocument,
8
- classify,
9
- inferLayout,
10
- } from './dist/index.js';
11
-
12
- const imagePath = process.argv[2] || './test/fixtures/sample.png';
13
- console.log(`\n📸 Analyzing: ${imagePath}\n`);
14
-
15
- const [text, blocks, faces, barcodes, rects, doc, labels] = await Promise.all([
16
- ocr(imagePath),
17
- ocr(imagePath, { format: 'blocks' }),
18
- detectFaces(imagePath),
19
- detectBarcodes(imagePath),
20
- detectRectangles(imagePath),
21
- detectDocument(imagePath),
22
- classify(imagePath),
23
- ]);
24
-
25
- const sep = (title) => console.log('\n' + '─'.repeat(60) + '\n' + title + '\n');
26
-
27
- sep('📝 OCR text'); console.log(text);
28
- sep('📝 OCR blocks'); console.log(JSON.stringify(blocks, null, 2));
29
- sep('👤 Faces'); console.log(JSON.stringify(faces, null, 2));
30
- sep('🔲 Barcodes'); console.log(JSON.stringify(barcodes, null, 2));
31
- sep('📦 Rectangles'); console.log(JSON.stringify(rects, null, 2));
32
- sep('📄 Document'); console.log(JSON.stringify(doc, null, 2));
33
- sep('🏷️ Classification'); console.log(JSON.stringify(labels, null, 2));
34
-
35
- const layout = inferLayout({ textBlocks: blocks, faces, barcodes, rectangles: rects, document: doc });
36
- sep('🗂️ Layout (reading order)'); console.log(JSON.stringify(layout, null, 2));
37
- console.log('\n' + '─'.repeat(60) + '\n');
package/eslint.config.js DELETED
@@ -1,21 +0,0 @@
1
- import tseslint from 'typescript-eslint';
2
- import prettier from 'eslint-config-prettier';
3
-
4
- export default tseslint.config(
5
- ...tseslint.configs.recommended,
6
- prettier,
7
- {
8
- files: ['src/**/*.ts'],
9
- languageOptions: {
10
- parser: tseslint.parser,
11
- parserOptions: {
12
- project: true, // Szuka najbliższego tsconfig.json
13
- },
14
- },
15
- rules: {
16
- '@typescript-eslint/no-explicit-any': 'warn',
17
- '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
18
- },
19
- },
20
- { ignores: ['dist/**', 'node_modules/**', 'bin/**'] }
21
- );