docling-core 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (46)
  1. docling_core/__init__.py +6 -0
  2. docling_core/py.typed +0 -0
  3. docling_core/resources/schemas/doc/ANN.json +171 -0
  4. docling_core/resources/schemas/doc/DOC.json +300 -0
  5. docling_core/resources/schemas/doc/OCR-output.json +166 -0
  6. docling_core/resources/schemas/doc/RAW.json +158 -0
  7. docling_core/resources/schemas/generated/ccs_document_schema.json +1071 -0
  8. docling_core/resources/schemas/generated/minimal_document_schema_flat.json +1129 -0
  9. docling_core/resources/schemas/search/search_doc_mapping.json +104 -0
  10. docling_core/resources/schemas/search/search_doc_mapping_v2.json +256 -0
  11. docling_core/search/__init__.py +6 -0
  12. docling_core/search/json_schema_to_search_mapper.py +406 -0
  13. docling_core/search/mapping.py +29 -0
  14. docling_core/search/meta.py +93 -0
  15. docling_core/search/package.py +56 -0
  16. docling_core/types/__init__.py +25 -0
  17. docling_core/types/base.py +248 -0
  18. docling_core/types/doc/__init__.py +6 -0
  19. docling_core/types/doc/base.py +199 -0
  20. docling_core/types/doc/doc_ann.py +76 -0
  21. docling_core/types/doc/doc_ocr.py +83 -0
  22. docling_core/types/doc/doc_raw.py +187 -0
  23. docling_core/types/doc/document.py +393 -0
  24. docling_core/types/gen/__init__.py +6 -0
  25. docling_core/types/gen/generic.py +33 -0
  26. docling_core/types/nlp/__init__.py +6 -0
  27. docling_core/types/nlp/qa.py +74 -0
  28. docling_core/types/nlp/qa_labels.py +118 -0
  29. docling_core/types/rec/__init__.py +6 -0
  30. docling_core/types/rec/attribute.py +55 -0
  31. docling_core/types/rec/base.py +90 -0
  32. docling_core/types/rec/predicate.py +133 -0
  33. docling_core/types/rec/record.py +95 -0
  34. docling_core/types/rec/statement.py +41 -0
  35. docling_core/types/rec/subject.py +77 -0
  36. docling_core/utils/__init__.py +6 -0
  37. docling_core/utils/alias.py +27 -0
  38. docling_core/utils/ds_generate_docs.py +144 -0
  39. docling_core/utils/ds_generate_jsonschema.py +62 -0
  40. docling_core/utils/validate.py +86 -0
  41. docling_core/utils/validators.py +100 -0
  42. docling_core-0.0.1.dist-info/LICENSE +21 -0
  43. docling_core-0.0.1.dist-info/METADATA +133 -0
  44. docling_core-0.0.1.dist-info/RECORD +46 -0
  45. docling_core-0.0.1.dist-info/WHEEL +4 -0
  46. docling_core-0.0.1.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Main package."""
docling_core/py.typed ADDED
File without changes
@@ -0,0 +1,171 @@
1
+ {
2
+ "$schema": "http://json-schema.org/schema#",
3
+ "definitions": {
4
+ "annot+pred": {
5
+ "type": "array",
6
+ "items": {
7
+ "type": "object",
8
+ "required": [
9
+ "cells",
10
+ "clusters",
11
+ "tables",
12
+ "source"
13
+ ],
14
+ "properties": {
15
+ "cells": {
16
+ "type": "array",
17
+ "items": {
18
+ "type": "object",
19
+ "required": [
20
+ "id",
21
+ "rawcell_id",
22
+ "label"
23
+ ],
24
+ "properties": {
25
+ "id": {
26
+ "type": "integer"
27
+ },
28
+ "rawcell_id": {
29
+ "type": "integer"
30
+ },
31
+ "label": {
32
+ "type": "string"
33
+ }
34
+ }
35
+ }
36
+ },
37
+ "clusters": {
38
+ "type": "array",
39
+ "items": {
40
+ "type": "object",
41
+ "required": [
42
+ "model",
43
+ "type",
44
+ "bbox",
45
+ "cell_ids",
46
+ "merged",
47
+ "id"
48
+ ],
49
+ "properties": {
50
+ "model": {
51
+ "type": "string"
52
+ },
53
+ "type": {
54
+ "type": "string"
55
+ },
56
+ "bbox": {
57
+ "type": "array",
58
+ "minItems": 4,
59
+ "maxItems": 4,
60
+ "items": {
61
+ "type": "number"
62
+ }
63
+ },
64
+ "cell_ids": {
65
+ "type": "array",
66
+ "items": {
67
+ "type": "integer"
68
+ }
69
+ },
70
+ "merged": {
71
+ "type": "boolean"
72
+ },
73
+ "id": {
74
+ "type": "integer"
75
+ }
76
+ }
77
+ }
78
+ },
79
+ "tables": {
80
+ "type": "array",
81
+ "items": {
82
+ "type": "object",
83
+ "required": [
84
+ "cell_id",
85
+ "label",
86
+ "rows",
87
+ "cols"
88
+ ],
89
+ "properties": {
90
+ "cell_id": {
91
+ "type": "integer"
92
+ },
93
+ "label": {
94
+ "type": "string"
95
+ },
96
+ "rows": {
97
+ "type": "array",
98
+ "items": {
99
+ "type": "integer"
100
+ }
101
+ },
102
+ "cols": {
103
+ "type": "array",
104
+ "items": {
105
+ "type": "integer"
106
+ }
107
+ }
108
+ }
109
+ }
110
+ },
111
+ "source": {
112
+ "type": "object",
113
+ "required": [
114
+ "type",
115
+ "info",
116
+ "timestamp"
117
+ ],
118
+ "properties": {
119
+ "type": {
120
+ "type": "string"
121
+ },
122
+ "timestamp": {
123
+ "type": "number"
124
+ },
125
+ "info": {
126
+ "type": "object",
127
+ "required": [
128
+ "display_name",
129
+ "model_name",
130
+ "model_class",
131
+ "model_version",
132
+ "model_id"
133
+ ],
134
+ "properties": {
135
+ "display_name": {
136
+ "type": "string"
137
+ },
138
+ "model_name": {
139
+ "type": "string"
140
+ },
141
+ "model_class": {
142
+ "type": "string"
143
+ },
144
+ "model_version": {
145
+ "type": "string"
146
+ },
147
+ "model_id": {
148
+ "type": "string"
149
+ }
150
+ }
151
+ }
152
+ }
153
+ }
154
+ }
155
+ }
156
+ }
157
+ },
158
+ "properties": {
159
+ "annotations": {
160
+ "$ref": "#/definitions/annot+pred"
161
+ },
162
+ "predictions": {
163
+ "$ref": "#/definitions/annot+pred"
164
+ },
165
+ "reports": {
166
+ "type": "array"
167
+ }
168
+ },
169
+ "minProperties": 1,
170
+ "type": "object"
171
+ }
@@ -0,0 +1,300 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "type": "object",
4
+ "properties": {
5
+ "_type": {
6
+ "type": "string"
7
+ },
8
+ "bitmaps": {
9
+ "type": "array",
10
+ "items": {
11
+ "type": "object",
12
+ "properties": {
13
+ "bounding-box": {
14
+ "type": "object",
15
+ "properties": {
16
+ "max": {
17
+ "type": "array",
18
+ "items": {
19
+ "type": "number"
20
+ }
21
+ },
22
+ "min": {
23
+ "type": "array",
24
+ "items": {
25
+ "type": "number"
26
+ }
27
+ }
28
+ }
29
+ },
30
+ "image-id": {
31
+ "type": "string"
32
+ },
33
+ "prov": {
34
+ "type": "array",
35
+ "items": {
36
+ "type": "object",
37
+ "properties": {
38
+ "bbox": {
39
+ "type": "array",
40
+ "items": {
41
+ "type": "number"
42
+ }
43
+ },
44
+ "page": {
45
+ "type": "integer"
46
+ },
47
+ "span": {
48
+ "type": "array",
49
+ "items": {
50
+ "type": "integer"
51
+ }
52
+ }
53
+ }
54
+ }
55
+ },
56
+ "type": {
57
+ "type": "string"
58
+ }
59
+ }
60
+ }
61
+ },
62
+ "description": {
63
+ "type": "object",
64
+ "properties": {
65
+ "abstract": {
66
+ "type": "string"
67
+ },
68
+ "affiliations": {
69
+ "type": "string"
70
+ },
71
+ "authors": {
72
+ "type": "string"
73
+ },
74
+ "title": {
75
+ "type": "string"
76
+ }
77
+ }
78
+ },
79
+ "equations": {
80
+ "type": "array"
81
+ },
82
+ "figures": {
83
+ "type": "array",
84
+ "items": {
85
+ "type": "object",
86
+ "properties": {
87
+ "bounding-box": {
88
+ "type": "object",
89
+ "properties": {
90
+ "max": {
91
+ "type": "array",
92
+ "items": {
93
+ "type": "number"
94
+ }
95
+ },
96
+ "min": {
97
+ "type": "array",
98
+ "items": {
99
+ "type": "number"
100
+ }
101
+ }
102
+ }
103
+ },
104
+ "image-id": {
105
+ "type": "string"
106
+ },
107
+ "model": {
108
+ "type": "string"
109
+ },
110
+ "prov": {
111
+ "type": "array",
112
+ "items": {
113
+ "type": "object",
114
+ "properties": {
115
+ "bbox": {
116
+ "type": "array",
117
+ "items": {
118
+ "type": "number"
119
+ }
120
+ },
121
+ "page": {
122
+ "type": "integer"
123
+ },
124
+ "span": {
125
+ "type": "array",
126
+ "items": {
127
+ "type": "integer"
128
+ }
129
+ }
130
+ }
131
+ }
132
+ },
133
+ "type": {
134
+ "type": "string"
135
+ }
136
+ }
137
+ }
138
+ },
139
+ "file-info": {
140
+ "type": "object",
141
+ "properties": {
142
+ "#-pages": {
143
+ "type": "integer"
144
+ },
145
+ "document-hash": {
146
+ "type": "string"
147
+ },
148
+ "filename": {
149
+ "type": "string"
150
+ },
151
+ "page-hashes": {
152
+ "type": "array",
153
+ "items": {
154
+ "type": "object",
155
+ "properties": {
156
+ "hash": {
157
+ "type": "string"
158
+ },
159
+ "model": {
160
+ "type": "string"
161
+ },
162
+ "page": {
163
+ "type": "integer"
164
+ }
165
+ }
166
+ }
167
+ },
168
+ "description": {
169
+ "type": "object",
170
+ "properties": {
171
+ "keywords": {
172
+ "type": "string"
173
+ }
174
+ }
175
+ },
176
+ "collection-name": {
177
+ "type": "string"
178
+ }
179
+ }
180
+ },
181
+ "footnotes": {
182
+ "type": "array"
183
+ },
184
+ "main-text": {
185
+ "type": "array",
186
+ "items": {
187
+ "type": "object",
188
+ "properties": {
189
+ "font": {
190
+ "type": "string"
191
+ },
192
+ "name": {
193
+ "type": "string"
194
+ },
195
+ "prov": {
196
+ "type": "array",
197
+ "items": {
198
+ "type": "object",
199
+ "properties": {
200
+ "bbox": {
201
+ "type": "array",
202
+ "items": {
203
+ "type": "number"
204
+ }
205
+ },
206
+ "page": {
207
+ "type": "integer"
208
+ },
209
+ "span": {
210
+ "type": "array",
211
+ "items": {
212
+ "type": "integer"
213
+ }
214
+ }
215
+ }
216
+ }
217
+ },
218
+ "text": {
219
+ "type": "string"
220
+ },
221
+ "type": {
222
+ "type": "string"
223
+ }
224
+ }
225
+ }
226
+ },
227
+ "tables": {
228
+ "type": "array",
229
+ "items": {
230
+ "type": "object",
231
+ "properties": {
232
+ "#-cols": {
233
+ "type": "integer"
234
+ },
235
+ "#-rows": {
236
+ "type": "integer"
237
+ },
238
+ "bounding-box": {
239
+ "type": "object",
240
+ "properties": {
241
+ "max": {
242
+ "type": "array",
243
+ "items": {
244
+ "type": "number"
245
+ }
246
+ },
247
+ "min": {
248
+ "type": "array",
249
+ "items": {
250
+ "type": "number"
251
+ }
252
+ }
253
+ }
254
+ },
255
+ "data": {
256
+ "type": "array",
257
+ "items": {
258
+ "type": "array",
259
+ "items": {
260
+ "type": "string"
261
+ }
262
+ }
263
+ },
264
+ "model": {
265
+ "type": "string"
266
+ },
267
+ "prov": {
268
+ "type": "array",
269
+ "items": {
270
+ "type": "object",
271
+ "properties": {
272
+ "bbox": {
273
+ "type": "array",
274
+ "items": {
275
+ "type": "number"
276
+ }
277
+ },
278
+ "page": {
279
+ "type": "integer"
280
+ },
281
+ "span": {
282
+ "type": "array",
283
+ "items": {
284
+ "type": "integer"
285
+ }
286
+ }
287
+ }
288
+ }
289
+ },
290
+ "text": {
291
+ "type": "string"
292
+ },
293
+ "type": {
294
+ "type": "string"
295
+ }
296
+ }
297
+ }
298
+ }
299
+ }
300
+ }
@@ -0,0 +1,166 @@
1
+ {
2
+ "$schema": "http://json-schema.org/schema#",
3
+ "type": "object",
4
+ "required": [
5
+ "_meta",
6
+ "info",
7
+ "dimension",
8
+ "words",
9
+ "cells",
10
+ "boxes",
11
+ "paths"
12
+ ],
13
+ "properties": {
14
+ "_meta": {
15
+ "type": "object",
16
+ "required": [
17
+ "page",
18
+ "coords-order",
19
+ "coords-origin"
20
+ ],
21
+ "properties": {
22
+ "page": {
23
+ "type": "object",
24
+ "required": [
25
+ "width",
26
+ "height"
27
+ ],
28
+ "properties": {
29
+ "width": {
30
+ "type": "number"
31
+ },
32
+ "height": {
33
+ "type": "number"
34
+ }
35
+ }
36
+ },
37
+ "coords-order": {
38
+ "type": "array",
39
+ "items": {
40
+ "type": "string"
41
+ }
42
+ },
43
+ "coords-origin": {
44
+ "type": "string"
45
+ }
46
+ }
47
+ },
48
+ "info": {
49
+ "type": "object"
50
+ },
51
+ "dimension": {
52
+ "type": "object",
53
+ "required": [
54
+ "width",
55
+ "height"
56
+ ],
57
+ "properties": {
58
+ "width": {
59
+ "type": "number"
60
+ },
61
+ "height": {
62
+ "type": "number"
63
+ }
64
+ }
65
+ },
66
+ "words": {
67
+ "type": "array",
68
+ "items": {
69
+ "type": "object",
70
+ "required": [
71
+ "confidence",
72
+ "bbox",
73
+ "content"
74
+ ],
75
+ "properties": {
76
+ "confidence": {
77
+ "type": "number"
78
+ },
79
+ "bbox": {
80
+ "type": "array",
81
+ "items": {
82
+ "type": "number"
83
+ }
84
+ },
85
+ "content": {
86
+ "type": "string"
87
+ }
88
+ }
89
+ }
90
+ },
91
+ "cells": {
92
+ "type": "array",
93
+ "items": {
94
+ "type": "object",
95
+ "required": [
96
+ "confidence",
97
+ "bbox",
98
+ "content"
99
+ ],
100
+ "properties": {
101
+ "confidence": {
102
+ "type": "number"
103
+ },
104
+ "bbox": {
105
+ "type": "array",
106
+ "items": {
107
+ "type": "number"
108
+ }
109
+ },
110
+ "content": {
111
+ "type": "string"
112
+ }
113
+ }
114
+ }
115
+ },
116
+ "boxes": {
117
+ "type": "array",
118
+ "items": {
119
+ "type": "object",
120
+ "required": [
121
+ "confidence",
122
+ "bbox",
123
+ "content"
124
+ ],
125
+ "properties": {
126
+ "confidence": {
127
+ "type": "number"
128
+ },
129
+ "bbox": {
130
+ "type": "array",
131
+ "items": {
132
+ "type": "number"
133
+ }
134
+ },
135
+ "content": {
136
+ "type": "string"
137
+ }
138
+ }
139
+ }
140
+ },
141
+ "paths": {
142
+ "type": "array",
143
+ "items": {
144
+ "type": "object",
145
+ "required": [
146
+ "x",
147
+ "y"
148
+ ],
149
+ "properties": {
150
+ "x": {
151
+ "type": "array",
152
+ "items": {
153
+ "type": "number"
154
+ }
155
+ },
156
+ "y": {
157
+ "type": "array",
158
+ "items": {
159
+ "type": "number"
160
+ }
161
+ }
162
+ }
163
+ }
164
+ }
165
+ }
166
+ }