@hoshomoh/react-native-document-scanner 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/DocumentScanner.podspec +22 -0
- package/LICENSE +20 -0
- package/README.md +384 -0
- package/android/build.gradle +72 -0
- package/android/gradle.properties +17 -0
- package/android/local.properties +8 -0
- package/android/src/main/AndroidManifest.xml +8 -0
- package/android/src/main/java/com/documentscanner/DocumentScannerModule.kt +217 -0
- package/android/src/main/java/com/documentscanner/DocumentScannerPackage.kt +39 -0
- package/android/src/main/java/com/documentscanner/ImageProcessor.kt +325 -0
- package/android/src/main/java/com/documentscanner/Logger.kt +36 -0
- package/android/src/main/java/com/documentscanner/OCRConfiguration.kt +56 -0
- package/android/src/main/java/com/documentscanner/Options.kt +109 -0
- package/android/src/main/java/com/documentscanner/ScannerError.kt +18 -0
- package/android/src/main/java/com/documentscanner/TextRecognizer.kt +56 -0
- package/android/src/main/java/com/documentscanner/TextRecognizerV1.kt +68 -0
- package/android/src/main/java/com/documentscanner/TextRecognizerV2.kt +244 -0
- package/ios/DocumentScanner.h +5 -0
- package/ios/DocumentScanner.mm +113 -0
- package/ios/DocumentScannerManager.swift +148 -0
- package/ios/Errors.swift +33 -0
- package/ios/ImageProcessor.swift +78 -0
- package/ios/ImageUtil.swift +279 -0
- package/ios/Logger.swift +43 -0
- package/ios/OCRConfiguration.swift +60 -0
- package/ios/Options.swift +109 -0
- package/ios/ResponseUtil.swift +25 -0
- package/ios/ScanModels.swift +84 -0
- package/ios/TextRecognizer.swift +134 -0
- package/ios/TextRecognizerV1.swift +56 -0
- package/ios/TextRecognizerV2.swift +169 -0
- package/lib/module/NativeDocumentScanner.js +51 -0
- package/lib/module/NativeDocumentScanner.js.map +1 -0
- package/lib/module/index.js +40 -0
- package/lib/module/index.js.map +1 -0
- package/lib/module/package.json +1 -0
- package/lib/module/textReconstructor.js +147 -0
- package/lib/module/textReconstructor.js.map +1 -0
- package/lib/typescript/package.json +1 -0
- package/lib/typescript/src/NativeDocumentScanner.d.ts +191 -0
- package/lib/typescript/src/NativeDocumentScanner.d.ts.map +1 -0
- package/lib/typescript/src/index.d.ts +34 -0
- package/lib/typescript/src/index.d.ts.map +1 -0
- package/lib/typescript/src/textReconstructor.d.ts +60 -0
- package/lib/typescript/src/textReconstructor.d.ts.map +1 -0
- package/package.json +137 -0
- package/src/NativeDocumentScanner.ts +205 -0
- package/src/index.ts +61 -0
- package/src/textReconstructor.ts +212 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import UIKit
|
|
3
|
+
|
|
4
|
+
/**
 Base options shared by both Scan and Process operations.
 Contains configuration for image output and OCR processing.
 */
public class BaseOptions {
    /// JPEG/PNG compression quality, clamped to [0.1, 1.0].
    public let quality: CGFloat
    /// Output format: "jpg" or "png" (anything else falls back to "jpg").
    public let format: String
    /// Image filter applied before output; whitelisted in `init(from:defaultIncludeText:)`.
    public let filter: String
    /// Whether to include a Base64 copy of the image in the result.
    public let includeBase64: Bool
    /// Whether to run OCR on the image.
    public let includeText: Bool
    /// OCR engine version: 1 (raw) or 2 (heuristic / document understanding).
    public let textVersion: Int

    init(quality: CGFloat, format: String, filter: String, includeBase64: Bool, includeText: Bool, textVersion: Int) {
        self.quality = quality
        self.format = format
        self.filter = filter
        self.includeBase64 = includeBase64
        self.includeText = includeText
        self.textVersion = textVersion
    }

    /**
     Reads an integer option from the bridge dictionary.
     Covers NSNumber, Int, and Double — all three bridging representations that can appear
     depending on whether the call comes through the old bridge or JSI (new arch).
     */
    static func intOption(from dictionary: [String: Any]?, key: String, fallback: Int) -> Int {
        guard let raw = dictionary?[key] else { return fallback }
        if let n = raw as? NSNumber { return n.intValue }
        if let i = raw as? Int { return i }
        if let d = raw as? Double { return Int(d) }
        return fallback
    }

    /**
     Reads a floating-point option from the bridge dictionary.
     Mirrors `intOption`: a plain `as? CGFloat` cast is not reliable for every bridging
     representation (e.g. a whole-number literal like `quality: 1` from JS can arrive
     as Int), so all three forms are handled explicitly.
     */
    static func cgFloatOption(from dictionary: [String: Any]?, key: String, fallback: CGFloat) -> CGFloat {
        guard let raw = dictionary?[key] else { return fallback }
        if let n = raw as? NSNumber { return CGFloat(n.doubleValue) }
        if let d = raw as? Double { return CGFloat(d) }
        if let i = raw as? Int { return CGFloat(i) }
        return fallback
    }

    /**
     Convenience initializer to parse common options from a dictionary.
     - Parameters:
       - dictionary: Raw options dictionary.
       - defaultIncludeText: Default value for includeText (Scan defaults to false, Process to true).
     */
    init(from dictionary: [String: Any]?, defaultIncludeText: Bool) {
        /* Quality: parse robustly across bridge representations, then clamp to [0.1, 1.0]. */
        let q = BaseOptions.cgFloatOption(from: dictionary, key: "quality", fallback: 1.0)
        self.quality = max(0.1, min(1.0, q))

        /* Format: whitelist [jpg, png] */
        let f = dictionary?["format"] as? String ?? "jpg"
        self.format = (f == "png") ? "png" : "jpg"

        /* Filter: whitelist supported types */
        let filterInput = dictionary?["filter"] as? String ?? "color"
        let validFilters = ["color", "grayscale", "monochrome", "denoise", "sharpen", "ocrOptimized"]
        self.filter = validFilters.contains(filterInput) ? filterInput : "color"

        self.includeBase64 = dictionary?["includeBase64"] as? Bool ?? false
        self.includeText = dictionary?["includeText"] as? Bool ?? defaultIncludeText

        /* Text Version: allow [1, 2]. */
        let rawVersion = BaseOptions.intOption(from: dictionary, key: "textVersion", fallback: 2)
        self.textVersion = (rawVersion == 1) ? 1 : 2
    }
}
|
|
66
|
+
|
|
67
|
+
/**
 Strongly-typed representation of scan options.
 Parses the raw dictionary from React Native and provides defaults.
 */
public class ScanOptions: BaseOptions {
    /// Maximum number of pages the scanner UI allows; 0 means unlimited.
    public let maxPageCount: Int

    /**
     Initializes ScanOptions from a raw dictionary.
     - Parameters:
       - dictionary: The options dictionary from React Native.
       - fallbackPageCount: Default page count if not specified.
     */
    public init(from dictionary: [String: Any]?, fallbackPageCount: Int) {
        /* Max Page Count: clamp to [0, 100]; 0 is interpreted as "unlimited". */
        let requested = BaseOptions.intOption(from: dictionary, key: "maxPageCount", fallback: fallbackPageCount)
        self.maxPageCount = min(max(requested, 0), 100)

        /* Scan defaults to OCR disabled unless the caller opts in. */
        super.init(from: dictionary, defaultIncludeText: false)
    }
}
|
|
88
|
+
|
|
89
|
+
/**
 Strongly-typed representation of process options.
 Parses the raw dictionary from React Native for processDocuments.
 */
public class ProcessOptions: BaseOptions {
    /// File paths / URIs of the images to process.
    public let images: [String]

    /**
     Initializes ProcessOptions from a raw dictionary.
     - Parameter dictionary: The options dictionary from React Native.
     - Returns: nil if 'images' array is missing.
     */
    public init?(from dictionary: [String: Any]?) {
        /* 'images' is mandatory — a nil dictionary or a missing/mistyped entry aborts the init. */
        guard let paths = dictionary?["images"] as? [String] else { return nil }
        self.images = paths

        /* Process defaults to OCR enabled, unlike Scan. */
        super.init(from: dictionary, defaultIncludeText: true)
    }
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import UIKit
|
|
3
|
+
|
|
4
|
+
/** Utility class for constructing the response objects for React Native. */
public class ResponseUtil {

    /**
     Constructs the ScanResult struct.
     - Parameters:
       - uri: The local file path.
       - base64: Optional Base64 string.
       - text: Optional OCR text.
       - blocks: Optional OCR blocks.
       - metadata: Engine/configuration metadata for this result.
     - Returns: A `ScanResult` struct.
     */
    public static func buildResult(uri: String?, base64: String?, text: String?, blocks: [TextBlock]?, metadata: ScanMetadata) -> ScanResult {
        /* Thin wrapper over the memberwise initializer; kept as the single
           construction point so the response shape stays centralized. */
        ScanResult(uri: uri, base64: base64, text: text, blocks: blocks, metadata: metadata)
    }
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
|
|
3
|
+
/** Describes which OCR engine and configuration produced a ScanResult. */
public struct ScanMetadata: Encodable {
    /// Platform identifier. Always "ios".
    public let platform: String
    /// OCR version requested (1 = Raw, 2 = Heuristic / RecognizeDocuments on iOS 26+).
    public let textVersion: Int
    /// Image filter that was applied to the image before OCR.
    public let filter: String
    /// The specific OCR engine used, or "none" if OCR was not requested.
    /// - "RecognizeDocumentsRequest": iOS 26+ native document understanding (V2).
    /// - "VNRecognizeTextRequest": Vision framework text request (V1 or V2 on iOS < 26).
    /// - "none": OCR was not performed (includeText was false).
    public let ocrEngine: String
}
|
|
17
|
+
|
|
18
|
+
/** Represents the geometric bounds of a text block in normalized coordinates (0.0 - 1.0).
    Uses a top-left origin: producers (TextRecognizerV1/V2) convert Vision's
    bottom-left-origin boxes via `1 - y - height` before constructing a Frame. */
public struct Frame: Encodable {
    /// Horizontal position of the top-left corner.
    public let x: Double
    /// Vertical position of the top-left corner (0 = top of image).
    public let y: Double
    /// Width of the bounding box.
    public let width: Double
    /// Height of the bounding box.
    public let height: Double
}
|
|
29
|
+
|
|
30
|
+
/** Represents a recognized block of text with its position and confidence level. */
public struct TextBlock: Encodable {
    /// The recognized text string.
    public let text: String
    /// The bounding box of the text (normalized, top-left origin).
    public let frame: Frame
    /// The confidence level of the recognition (0.0 - 1.0).
    /// nil when the producing engine does not report confidence
    /// (e.g. the iOS 26 document-request path).
    public let confidence: Double?
}
|
|
39
|
+
|
|
40
|
+
/** Represents the final result of a scanned page. */
public struct ScanResult: Encodable {
    public let uri: String?
    public let base64: String?
    public let text: String?
    public let blocks: [TextBlock]?
    public let metadata: ScanMetadata

    /** Converts the struct to a Dictionary for React Native bridge.
        Optional fields are omitted entirely rather than sent as null. */
    public var dictionary: [String: Any] {
        /* Metadata is always present, so seed the payload with it. */
        var payload: [String: Any] = [
            "metadata": [
                "platform": metadata.platform,
                "textVersion": metadata.textVersion,
                "filter": metadata.filter,
                "ocrEngine": metadata.ocrEngine
            ]
        ]

        if let uri { payload["uri"] = uri }
        if let base64 { payload["base64"] = base64 }
        if let text { payload["text"] = text }

        if let blocks {
            /* Map each block by hand so the bridge sees plain dictionaries with a known shape. */
            payload["blocks"] = blocks.map { block -> [String: Any] in
                var entry: [String: Any] = [
                    "text": block.text,
                    "frame": [
                        "x": block.frame.x,
                        "y": block.frame.y,
                        "width": block.frame.width,
                        "height": block.frame.height
                    ]
                ]
                /* Confidence is optional — only attach it when the engine reported one. */
                if let confidence = block.confidence {
                    entry["confidence"] = confidence
                }
                return entry
            }
        }

        return payload
    }
}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Vision
|
|
3
|
+
import UIKit
|
|
4
|
+
|
|
5
|
+
/**
 A robust utility class for performing Optical Character Recognition (OCR)
 using Apple's Vision framework.

 Acts as a Facade delegating to versioned implementations:
 - TextRecognizerV1 (Raw)
 - TextRecognizerV2 (Heuristic, or native document understanding on iOS 26+)
 */
@available(iOS 13.0, *)
public class TextRecognizer {

    /**
     Extracts text from an image using Apple's Vision Framework.

     - Parameter image: `UIImage` object to process.
     - Parameter version: OCR engine version (1 = Raw, 2 = Heuristic).
     - Returns: A tuple containing the structured text and raw blocks, or nil when the
       image has no backing CGImage, the request fails, or no results are produced.
     */
    public static func recognizeText(from image: UIImage, version: Int = 2) async -> (text: String, blocks: [TextBlock])? {

        /* A UIImage backed by a CIImage (no CGImage) cannot be handed to Vision here. */
        guard let cgImage = image.cgImage else {
            Logger.warn("Could not retrieve CGImage from input.")
            return nil
        }

        /* iOS 26+: Use RecognizeDocumentsRequest for V2 — native document structure, no heuristics needed */
        if version == 2, #available(iOS 26.0, *) {
            return await recognizeWithDocumentRequest(cgImage: cgImage)
        }

        /* Configure the Vision Request */
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate

        /*
         V1 (Raw) uses standard language correction for general text.
         V2 (Heuristic) disables it to preserve document layout and prevent over-merging.
         */
        request.usesLanguageCorrection = (version == 1)

        /* Filter out noise and tiny text artifacts (height < 1% of image height) */
        request.minimumTextHeight = 0.01

        /* Enable automatic language detection for multilingual documents (iOS 16+) */
        if #available(iOS 16.0, *) {
            request.automaticallyDetectsLanguage = true
        }

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])

        do {
            /* Perform text recognition — blocking CPU-bound call, safe on background task */
            try handler.perform([request])

            guard let observations = request.results else {
                Logger.info("No text found in image.")
                return nil
            }

            /* Delegate post-processing to the versioned strategy. */
            if version == 1 {
                return TextRecognizerV1.recognize(observations)
            } else {
                return TextRecognizerV2.recognize(observations)
            }

        } catch {
            Logger.error("Text recognition request failed: \(error.localizedDescription)")
            return nil
        }
    }

    /**
     Returns the name of the OCR engine that will be used for the given version on the current OS.
     Used by ImageProcessor to populate ScanResult metadata.
     Must stay in sync with the availability branch in `recognizeText(from:version:)`.
     */
    public static func engineName(for version: Int) -> String {
        if version == 2, #available(iOS 26.0, *) {
            return "RecognizeDocumentsRequest"
        }
        return "VNRecognizeTextRequest"
    }

    /**
     iOS 26+ fast path using RecognizeDocumentsRequest.
     Returns native document structure (paragraphs) mapped to the same
     (text, blocks) format used by the heuristic path, so callers see no difference.
     One block per paragraph — for structured documents each paragraph is typically one visual line.
     */
    @available(iOS 26.0, *)
    private static func recognizeWithDocumentRequest(cgImage: CGImage) async -> (text: String, blocks: [TextBlock])? {
        do {
            let request = RecognizeDocumentsRequest()
            let observations = try await request.perform(on: cgImage)

            /* Only the first detected document is used. */
            guard let document = observations.first?.document else {
                Logger.info("No document structure found in image.")
                return nil
            }

            var lines: [String] = []
            var blocks: [TextBlock] = []

            for paragraph in document.paragraphs {
                let paragraphText = paragraph.transcript
                lines.append(paragraphText)

                /* Map bounding region to normalised top-left origin coordinates.
                   boundingRegion is NormalizedRegion (Contour) → .boundingBox gives NormalizedRect → .cgRect gives CGRect.
                   Vision uses bottom-left origin — convert to top-left: 1 - y - height */
                let box = paragraph.boundingRegion.boundingBox.cgRect
                let androidStyleY = 1.0 - box.origin.y - box.size.height
                let frame = Frame(
                    x: Double(box.origin.x),
                    y: Double(androidStyleY),
                    width: Double(box.size.width),
                    height: Double(box.size.height)
                )

                /* Confidence is intentionally nil — this path does not report per-paragraph confidence. */
                blocks.append(TextBlock(text: paragraphText, frame: frame, confidence: nil))
            }

            /* Trailing newline matches the V2 heuristic output format. */
            let text = lines.isEmpty ? "" : lines.joined(separator: "\n") + "\n"
            return (text: text, blocks: blocks)

        } catch {
            Logger.error("RecognizeDocumentsRequest failed: \(error.localizedDescription)")
            return nil
        }
    }
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Vision
|
|
3
|
+
|
|
4
|
+
/**
 Version 1: Raw Output (Standard Vision Behavior)
 */
@available(iOS 13.0, *)
public class TextRecognizerV1 {

    /**
     Performs raw text recognition from Vision observations.
     - Parameter observations: Raw results from VNRecognizeTextRequest.
     - Returns: Concatenated text and structured blocks.
     */
    public static func recognize(_ observations: [VNRecognizedTextObservation]) -> (text: String, blocks: [TextBlock]) {
        /*
         One block per observation, taking the top candidate and converting
         Vision's bottom-left-origin bounding box to an Android-style
         top-left origin: y' = 1.0 - y - height.
         */
        let mapped: [TextBlock] = observations.compactMap { observation in
            guard let best = observation.topCandidates(1).first else { return nil }

            let bounds = observation.boundingBox
            let topLeftY = 1.0 - bounds.origin.y - bounds.size.height

            return TextBlock(
                text: best.string,
                frame: Frame(
                    x: bounds.origin.x,
                    y: topLeftY,
                    width: bounds.size.width,
                    height: bounds.size.height
                ),
                confidence: Double(best.confidence)
            )
        }

        /*
         Reading order: sort top-to-bottom using the converted Y.
         Smaller Y values are higher on the page.
         */
        let ordered = mapped.sorted { $0.frame.y < $1.frame.y }

        /* Join the sorted block texts, one per line. */
        let joined = ordered.map { $0.text }.joined(separator: "\n")

        return (text: joined, blocks: ordered)
    }
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Vision
|
|
3
|
+
import UIKit
|
|
4
|
+
|
|
5
|
+
/**
 Version 2: Heuristic Enhanced (Line Clustering for Layout Preservation)
 */
@available(iOS 13.0, *)
public class TextRecognizerV2 {

    /**
     Groups Vision observations into visual lines (clusters), reconstructs
     horizontal spacing within each line, and returns one TextBlock per line.
     - Parameter observations: Raw results from VNRecognizeTextRequest.
     - Returns: Layout-preserving text (newline-terminated lines) and
       line-level blocks with averaged confidence, in top-left-origin
       normalized coordinates.
     */
    public static func recognize(_ observations: [VNRecognizedTextObservation]) -> (text: String, blocks: [TextBlock]) {

        /* 1. LineCluster Strategy */
        struct LineCluster {
            var observations: [VNRecognizedTextObservation]
            var unionBoundingBox: CGRect
            /* Per-member heights/centerYs are kept so medians stay robust even
               when the union box has been stretched by an outlier member. */
            var heights: [CGFloat]
            var centerYs: [CGFloat]

            var medianHeight: CGFloat {
                let sorted = heights.sorted()
                if sorted.isEmpty { return 0 }
                let mid = sorted.count / 2
                return sorted.count % 2 == 0
                    ? (sorted[mid - 1] + sorted[mid]) / 2.0
                    : sorted[mid]
            }

            var medianCenterY: CGFloat {
                let sorted = centerYs.sorted()
                if sorted.isEmpty { return 0 }
                let mid = sorted.count / 2
                return sorted.count % 2 == 0
                    ? (sorted[mid - 1] + sorted[mid]) / 2.0
                    : sorted[mid]
            }
        }

        /*
         Sort observations top-to-bottom for clustering.
         In RAW Vision space, higher on page means BIGGER Y.
         */
        let sortedObservations = observations.sorted { $0.boundingBox.midY > $1.boundingBox.midY }
        var clusters: [LineCluster] = []

        for obs in sortedObservations {
            let obsBox = obs.boundingBox
            let obsHeight = obsBox.height
            let obsCenterY = obsBox.midY

            /* Track the best-matching existing cluster: primary score is vertical
               overlap ratio, ties broken by centerline distance. */
            var bestClusterIndex: Int? = nil
            var bestOverlapRatio: CGFloat = 0.0
            var bestCenterDistance: CGFloat = .greatestFiniteMagnitude

            for (index, cluster) in clusters.enumerated() {
                let clusterBox = cluster.unionBoundingBox

                /* Heuristic: Height Compatibility — use median height, not union bbox height */
                let minH = min(cluster.medianHeight, obsHeight)
                let maxH = max(cluster.medianHeight, obsHeight)
                if (minH / maxH) < OCRConfiguration.heightCompatibilityThreshold { continue }

                /* Heuristic: Overlap & Centerline — use median centerY, not union bbox midY */
                let intersection = clusterBox.intersection(obsBox)
                let overlapHeight = max(0, intersection.height)
                let overlapRatio = overlapHeight / minH

                let centerDistance = abs(cluster.medianCenterY - obsCenterY)
                let typicalLineHeight = max(cluster.medianHeight, obsHeight)

                let isOverlapGood = overlapRatio >= OCRConfiguration.overlapRatioThreshold
                let isCenterClose = centerDistance <= (OCRConfiguration.centerlineDistanceFactor * typicalLineHeight)

                if (isOverlapGood || isCenterClose) {
                    /* Heuristic: Adaptive Cluster Growth Constraint — a merge must not
                       make the cluster taller than a multiple of the typical line height.
                       The allowed multiple depends on whether the boxes overlap
                       horizontally ("stacked") or are horizontally disjoint ("skewed"). */
                    let intersectX = max(0, min(clusterBox.maxX, obsBox.maxX) - max(clusterBox.minX, obsBox.minX))
                    let isStacked = intersectX > 0

                    let growthLimit = isStacked ? OCRConfiguration.stackedGrowthLimit : OCRConfiguration.skewedGrowthLimit

                    let newUnion = clusterBox.union(obsBox)
                    if newUnion.height <= (CGFloat(growthLimit) * typicalLineHeight) {
                        /* Score this cluster */
                        if overlapRatio > bestOverlapRatio {
                            bestOverlapRatio = overlapRatio
                            bestCenterDistance = centerDistance
                            bestClusterIndex = index
                        } else if abs(overlapRatio - bestOverlapRatio) < 0.01 && centerDistance < bestCenterDistance {
                            /* Near-tie on overlap: prefer the closer centerline. */
                            bestCenterDistance = centerDistance
                            bestClusterIndex = index
                        }
                    }
                }
            }

            if let idx = bestClusterIndex {
                /* Merge into the best cluster and update its running statistics. */
                clusters[idx].observations.append(obs)
                clusters[idx].unionBoundingBox = clusters[idx].unionBoundingBox.union(obsBox)
                clusters[idx].heights.append(obsHeight)
                clusters[idx].centerYs.append(obsCenterY)
            } else {
                /* No compatible cluster — this observation starts a new visual line. */
                clusters.append(LineCluster(
                    observations: [obs],
                    unionBoundingBox: obsBox,
                    heights: [obsHeight],
                    centerYs: [obsCenterY]
                ))
            }
        }

        /*
         Sort clusters top-to-bottom for final output.
         In RAW Vision space, higher on page means BIGGER Y.
         */
        clusters.sort { $0.unionBoundingBox.midY > $1.unionBoundingBox.midY }

        /* 2. Column Reconstruction (Adaptive Spacing) + Cluster-Based Blocks */
        var structuredText = ""
        var clusterBlocks: [TextBlock] = []

        for cluster in clusters {
            /* Sort line elements left-to-right */
            let lineObs = cluster.observations.sorted { $0.boundingBox.origin.x < $1.boundingBox.origin.x }
            let medianH = cluster.medianHeight

            var lineString = ""
            var lastXEnd: CGFloat = 0.0

            for (index, obs) in lineObs.enumerated() {
                guard let candidate = obs.topCandidates(1).first else { continue }
                let xStart = obs.boundingBox.origin.x

                if index > 0 {
                    let gap = xStart - lastXEnd
                    /* Spacing Heuristic: a gap wider than a fraction of the median
                       character height becomes multiple spaces (column separation),
                       capped at maxSpaces; otherwise a single word-space. */
                    if gap > (medianH * CGFloat(OCRConfiguration.adaptiveSpacingFactor)) {
                        let spaceWidth = medianH * CGFloat(OCRConfiguration.spaceWidthFactor)
                        let spaces = max(1, Int(gap / spaceWidth))
                        lineString += String(repeating: " ", count: min(spaces, OCRConfiguration.maxSpaces))
                    } else {
                        lineString += " "
                    }
                }

                lineString += candidate.string
                lastXEnd = xStart + obs.boundingBox.width
            }

            structuredText += lineString + "\n"

            /* Build one block per cluster (line-level, aligned with text output) */
            let unionBox = cluster.unionBoundingBox
            /* Convert Vision's bottom-left origin to top-left: 1 - y - height. */
            let androidStyleY = 1.0 - unionBox.origin.y - unionBox.size.height
            let frame = Frame(
                x: Double(unionBox.origin.x),
                y: Double(androidStyleY),
                width: Double(unionBox.size.width),
                height: Double(unionBox.size.height)
            )
            /* Cluster confidence = mean of member top-candidate confidences;
               nil when no member produced a candidate. */
            let confidences = cluster.observations.compactMap { obs -> Double? in
                obs.topCandidates(1).first.map { Double($0.confidence) }
            }
            let avgConfidence = confidences.isEmpty ? nil : confidences.reduce(0.0, +) / Double(confidences.count)
            clusterBlocks.append(TextBlock(text: lineString, frame: frame, confidence: avgConfidence))
        }

        return (text: structuredText, blocks: clusterBlocks)
    }
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
import { TurboModuleRegistry } from 'react-native';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Type union of all available filter values.
|
|
7
|
+
* Required for React Native Codegen compatibility.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Type union of all available format values.
|
|
12
|
+
* Required for React Native Codegen compatibility.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Represents a discrete block of text recognized by the OCR engine.
|
|
17
|
+
* Useful for mapping text to specific regions on the image.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Describes the OCR engine and configuration used to produce a ScanResult.
|
|
22
|
+
* Pass the parent `ScanResult` directly to `reconstructText` — it reads
|
|
23
|
+
* `metadata` internally to select the right reconstruction strategy.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* The result of a single scanned page.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Base configuration options shared by scan and process operations.
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Configuration options for the Document Scanner.
|
|
36
|
+
* Fields are listed explicitly (not via extends) for React Native Codegen compatibility —
|
|
37
|
+
* Codegen only generates struct fields declared directly on the interface.
|
|
38
|
+
*/
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Configuration options for processing existing images.
|
|
42
|
+
* Fields are listed explicitly (not via extends) for React Native Codegen compatibility —
|
|
43
|
+
* Codegen only generates struct fields declared directly on the interface.
|
|
44
|
+
*/
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* TurboModule Specification for the Document Scanner.
|
|
48
|
+
*/
|
|
49
|
+
|
|
50
|
+
export default TurboModuleRegistry.getEnforcing('DocumentScanner');
|
|
51
|
+
//# sourceMappingURL=NativeDocumentScanner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"names":["TurboModuleRegistry","getEnforcing"],"sourceRoot":"../../src","sources":["NativeDocumentScanner.ts"],"mappings":";;AAAA,SAASA,mBAAmB,QAA0B,cAAc;;AAEpE;AACA;AACA;AACA;;AASA;AACA;AACA;AACA;;AAGA;AACA;AACA;AACA;;AAkBA;AACA;AACA;AACA;AACA;;AAsBA;AACA;AACA;;AAcA;AACA;AACA;;AA4BA;AACA;AACA;AACA;AACA;;AA8BA;AACA;AACA;AACA;AACA;;AAkCA;AACA;AACA;;AAiBA,eAAeA,mBAAmB,CAACC,YAAY,CAAO,iBAAiB,CAAC","ignoreList":[]}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
import DocumentScanner from "./NativeDocumentScanner.js";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Available image filters.
|
|
7
|
+
* Use these constants instead of raw strings for type safety.
|
|
8
|
+
*/
|
|
9
|
+
export const Filter = {
|
|
10
|
+
/** No filter (original colors) */
|
|
11
|
+
COLOR: 'color',
|
|
12
|
+
/** Desaturated image */
|
|
13
|
+
GRAYSCALE: 'grayscale',
|
|
14
|
+
/** High-contrast black & white */
|
|
15
|
+
MONOCHROME: 'monochrome',
|
|
16
|
+
/** Noise reduction (for noisy photos) */
|
|
17
|
+
DENOISE: 'denoise',
|
|
18
|
+
/** Edge enhancement (for blurry text) */
|
|
19
|
+
SHARPEN: 'sharpen',
|
|
20
|
+
/** Full OCR pipeline: denoise → sharpen → monochrome */
|
|
21
|
+
OCR_OPTIMIZED: 'ocrOptimized'
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* Available output formats.
|
|
26
|
+
*/
|
|
27
|
+
export const Format = {
|
|
28
|
+
/** JPEG format (smaller file size) */
|
|
29
|
+
JPG: 'jpg',
|
|
30
|
+
/** PNG format (lossless) */
|
|
31
|
+
PNG: 'png'
|
|
32
|
+
};
|
|
33
|
+
export function scanDocuments(options) {
|
|
34
|
+
return DocumentScanner.scanDocuments(options);
|
|
35
|
+
}
|
|
36
|
+
export function processDocuments(options) {
|
|
37
|
+
return DocumentScanner.processDocuments(options);
|
|
38
|
+
}
|
|
39
|
+
export { reconstructText } from "./textReconstructor.js";
|
|
40
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"names":["DocumentScanner","Filter","COLOR","GRAYSCALE","MONOCHROME","DENOISE","SHARPEN","OCR_OPTIMIZED","Format","JPG","PNG","scanDocuments","options","processDocuments","reconstructText"],"sourceRoot":"../../src","sources":["index.ts"],"mappings":";;AAAA,OAAOA,eAAe,MAQf,4BAAyB;;AAEhC;AACA;AACA;AACA;AACA,OAAO,MAAMC,MAAM,GAAG;EACpB;EACAC,KAAK,EAAE,OAAO;EACd;EACAC,SAAS,EAAE,WAAW;EACtB;EACAC,UAAU,EAAE,YAAY;EACxB;EACAC,OAAO,EAAE,SAAS;EAClB;EACAC,OAAO,EAAE,SAAS;EAClB;EACAC,aAAa,EAAE;AACjB,CAAU;;AAEV;AACA;AACA;AACA,OAAO,MAAMC,MAAM,GAAG;EACpB;EACAC,GAAG,EAAE,KAAK;EACV;EACAC,GAAG,EAAE;AACP,CAAU;AAEV,OAAO,SAASC,aAAaA,CAACC,OAAqB,EAAyB;EAC1E,OAAOZ,eAAe,CAACW,aAAa,CAACC,OAAO,CAAC;AAC/C;AAEA,OAAO,SAASC,gBAAgBA,CAC9BD,OAAuB,EACA;EACvB,OAAOZ,eAAe,CAACa,gBAAgB,CAACD,OAAO,CAAC;AAClD;AAEA,SAASE,eAAe,QAAQ,wBAAqB","ignoreList":[]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"type":"module"}
|