@exulu/backend 1.46.1 → 1.47.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. package/.agents/skills/mintlify/SKILL.md +347 -0
  2. package/.editorconfig +15 -0
  3. package/.eslintrc.json +52 -0
  4. package/.jscpd.json +18 -0
  5. package/.prettierignore +5 -0
  6. package/.prettierrc.json +12 -0
  7. package/CHANGELOG.md +15 -2
  8. package/README.md +747 -0
  9. package/SECURITY.md +5 -0
  10. package/dist/index.cjs +12015 -10496
  11. package/dist/index.d.cts +725 -667
  12. package/dist/index.d.ts +725 -667
  13. package/dist/index.js +12034 -10508
  14. package/ee/LICENSE.md +62 -0
  15. package/ee/agentic-retrieval/index.ts +1109 -0
  16. package/ee/documents/THIRD_PARTY_LICENSES/docling.txt +31 -0
  17. package/ee/documents/processing/build_pdf_processor.sh +35 -0
  18. package/ee/documents/processing/chunk_markdown.py +263 -0
  19. package/ee/documents/processing/doc_processor.ts +635 -0
  20. package/ee/documents/processing/pdf_processor.spec +115 -0
  21. package/ee/documents/processing/pdf_to_markdown.py +420 -0
  22. package/ee/documents/processing/requirements.txt +4 -0
  23. package/ee/entitlements.ts +49 -0
  24. package/ee/markdown.ts +686 -0
  25. package/ee/queues/decorator.ts +140 -0
  26. package/ee/queues/queues.ts +156 -0
  27. package/ee/queues/server.ts +6 -0
  28. package/ee/rbac-resolver.ts +51 -0
  29. package/ee/rbac-update.ts +111 -0
  30. package/ee/schemas.ts +347 -0
  31. package/ee/tokenizer.ts +80 -0
  32. package/ee/workers.ts +1423 -0
  33. package/eslint.config.js +88 -0
  34. package/jest.config.ts +25 -0
  35. package/license.md +73 -49
  36. package/mintlify-docs/.mintignore +7 -0
  37. package/mintlify-docs/AGENTS.md +33 -0
  38. package/mintlify-docs/CLAUDE.MD +50 -0
  39. package/mintlify-docs/CONTRIBUTING.md +32 -0
  40. package/mintlify-docs/LICENSE +21 -0
  41. package/mintlify-docs/README.md +55 -0
  42. package/mintlify-docs/ai-tools/claude-code.mdx +43 -0
  43. package/mintlify-docs/ai-tools/cursor.mdx +39 -0
  44. package/mintlify-docs/ai-tools/windsurf.mdx +39 -0
  45. package/mintlify-docs/api-reference/core-types/agent-types.mdx +110 -0
  46. package/mintlify-docs/api-reference/core-types/analytics-types.mdx +95 -0
  47. package/mintlify-docs/api-reference/core-types/configuration-types.mdx +83 -0
  48. package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +106 -0
  49. package/mintlify-docs/api-reference/core-types/job-types.mdx +135 -0
  50. package/mintlify-docs/api-reference/core-types/overview.mdx +73 -0
  51. package/mintlify-docs/api-reference/core-types/prompt-types.mdx +102 -0
  52. package/mintlify-docs/api-reference/core-types/rbac-types.mdx +163 -0
  53. package/mintlify-docs/api-reference/core-types/session-types.mdx +77 -0
  54. package/mintlify-docs/api-reference/core-types/user-management.mdx +112 -0
  55. package/mintlify-docs/api-reference/core-types/workflow-types.mdx +88 -0
  56. package/mintlify-docs/api-reference/core-types.mdx +585 -0
  57. package/mintlify-docs/api-reference/dynamic-types.mdx +851 -0
  58. package/mintlify-docs/api-reference/endpoint/create.mdx +4 -0
  59. package/mintlify-docs/api-reference/endpoint/delete.mdx +4 -0
  60. package/mintlify-docs/api-reference/endpoint/get.mdx +4 -0
  61. package/mintlify-docs/api-reference/endpoint/webhook.mdx +4 -0
  62. package/mintlify-docs/api-reference/introduction.mdx +661 -0
  63. package/mintlify-docs/api-reference/mutations.mdx +1012 -0
  64. package/mintlify-docs/api-reference/openapi.json +217 -0
  65. package/mintlify-docs/api-reference/queries.mdx +1154 -0
  66. package/mintlify-docs/backend/introduction.mdx +218 -0
  67. package/mintlify-docs/changelog.mdx +293 -0
  68. package/mintlify-docs/community-edition.mdx +304 -0
  69. package/mintlify-docs/core/exulu-agent/api-reference.mdx +894 -0
  70. package/mintlify-docs/core/exulu-agent/configuration.mdx +690 -0
  71. package/mintlify-docs/core/exulu-agent/introduction.mdx +552 -0
  72. package/mintlify-docs/core/exulu-app/api-reference.mdx +481 -0
  73. package/mintlify-docs/core/exulu-app/configuration.mdx +319 -0
  74. package/mintlify-docs/core/exulu-app/introduction.mdx +117 -0
  75. package/mintlify-docs/core/exulu-authentication.mdx +810 -0
  76. package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +1011 -0
  77. package/mintlify-docs/core/exulu-chunkers/configuration.mdx +596 -0
  78. package/mintlify-docs/core/exulu-chunkers/introduction.mdx +403 -0
  79. package/mintlify-docs/core/exulu-context/api-reference.mdx +911 -0
  80. package/mintlify-docs/core/exulu-context/configuration.mdx +648 -0
  81. package/mintlify-docs/core/exulu-context/introduction.mdx +394 -0
  82. package/mintlify-docs/core/exulu-database.mdx +811 -0
  83. package/mintlify-docs/core/exulu-default-agents.mdx +545 -0
  84. package/mintlify-docs/core/exulu-eval/api-reference.mdx +772 -0
  85. package/mintlify-docs/core/exulu-eval/configuration.mdx +680 -0
  86. package/mintlify-docs/core/exulu-eval/introduction.mdx +459 -0
  87. package/mintlify-docs/core/exulu-logging.mdx +464 -0
  88. package/mintlify-docs/core/exulu-otel.mdx +670 -0
  89. package/mintlify-docs/core/exulu-queues/api-reference.mdx +648 -0
  90. package/mintlify-docs/core/exulu-queues/configuration.mdx +650 -0
  91. package/mintlify-docs/core/exulu-queues/introduction.mdx +474 -0
  92. package/mintlify-docs/core/exulu-reranker/api-reference.mdx +630 -0
  93. package/mintlify-docs/core/exulu-reranker/configuration.mdx +663 -0
  94. package/mintlify-docs/core/exulu-reranker/introduction.mdx +516 -0
  95. package/mintlify-docs/core/exulu-tool/api-reference.mdx +723 -0
  96. package/mintlify-docs/core/exulu-tool/configuration.mdx +805 -0
  97. package/mintlify-docs/core/exulu-tool/introduction.mdx +539 -0
  98. package/mintlify-docs/core/exulu-variables/api-reference.mdx +699 -0
  99. package/mintlify-docs/core/exulu-variables/configuration.mdx +736 -0
  100. package/mintlify-docs/core/exulu-variables/introduction.mdx +511 -0
  101. package/mintlify-docs/development.mdx +94 -0
  102. package/mintlify-docs/docs.json +248 -0
  103. package/mintlify-docs/enterprise-edition.mdx +538 -0
  104. package/mintlify-docs/essentials/code.mdx +35 -0
  105. package/mintlify-docs/essentials/images.mdx +59 -0
  106. package/mintlify-docs/essentials/markdown.mdx +88 -0
  107. package/mintlify-docs/essentials/navigation.mdx +87 -0
  108. package/mintlify-docs/essentials/reusable-snippets.mdx +110 -0
  109. package/mintlify-docs/essentials/settings.mdx +318 -0
  110. package/mintlify-docs/favicon.svg +3 -0
  111. package/mintlify-docs/frontend/introduction.mdx +39 -0
  112. package/mintlify-docs/getting-started.mdx +267 -0
  113. package/mintlify-docs/guides/custom-agent.mdx +608 -0
  114. package/mintlify-docs/guides/first-agent.mdx +315 -0
  115. package/mintlify-docs/images/admin_ui.png +0 -0
  116. package/mintlify-docs/images/contexts.png +0 -0
  117. package/mintlify-docs/images/create_agents.png +0 -0
  118. package/mintlify-docs/images/evals.png +0 -0
  119. package/mintlify-docs/images/graphql.png +0 -0
  120. package/mintlify-docs/images/graphql_api.png +0 -0
  121. package/mintlify-docs/images/hero-dark.png +0 -0
  122. package/mintlify-docs/images/hero-light.png +0 -0
  123. package/mintlify-docs/images/hero.png +0 -0
  124. package/mintlify-docs/images/knowledge_sources.png +0 -0
  125. package/mintlify-docs/images/mcp.png +0 -0
  126. package/mintlify-docs/images/scaling.png +0 -0
  127. package/mintlify-docs/index.mdx +411 -0
  128. package/mintlify-docs/logo/dark.svg +9 -0
  129. package/mintlify-docs/logo/light.svg +9 -0
  130. package/mintlify-docs/partners.mdx +558 -0
  131. package/mintlify-docs/products.mdx +77 -0
  132. package/mintlify-docs/snippets/snippet-intro.mdx +4 -0
  133. package/mintlify-docs/styles.css +207 -0
  134. package/{documentation → old-documentation}/logging.md +3 -3
  135. package/package.json +35 -4
  136. package/skills-lock.json +10 -0
  137. package/types/context-processor.ts +45 -0
  138. package/types/exulu-table-definition.ts +79 -0
  139. package/types/file-types.ts +18 -0
  140. package/types/models/agent.ts +10 -12
  141. package/types/models/exulu-agent-tool-config.ts +11 -0
  142. package/types/models/rate-limiter-rules.ts +7 -0
  143. package/types/provider-config.ts +21 -0
  144. package/types/queue-config.ts +16 -0
  145. package/types/rbac-rights-modes.ts +1 -0
  146. package/types/statistics.ts +20 -0
  147. package/types/workflow.ts +31 -0
  148. package/changelogs/10.11.2025_03.12.2025.md +0 -316
  149. package/types/models/agent-backend.ts +0 -15
  150. /package/{documentation → old-documentation}/otel.md +0 -0
  151. /package/{documentation → old-documentation}/patch-older-releases.md +0 -0
@@ -0,0 +1,115 @@
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec: bundle pdf_processor.py into a single-file executable.
from PyInstaller.utils.hooks import copy_metadata, collect_data_files
import os

block_cipher = None

# Distributions whose dist-info metadata must be bundled — importlib.metadata
# lookups inside the frozen app fail without it.
_METADATA_DISTS = [
    'docling',
    'docling-core',
    'docling-parse',
    'docling-ibm-models',
    'transformers',
    'torch',
    'tokenizers',
    'huggingface-hub',
    'pydantic',
    'pydantic-core',
]

# Packages whose non-code data files (models, configs, resources) are needed
# at runtime and must be collected into the bundle.
_DATA_PACKAGES = [
    'docling_parse',
    'docling',
    'docling_core',
    'docling_ibm_models',
    'transformers',
]

datas = []
for dist in _METADATA_DISTS:
    datas += copy_metadata(dist)
for package in _DATA_PACKAGES:
    datas += collect_data_files(package)

a = Analysis(
    ['pdf_processor.py'],
    pathex=[],
    binaries=[],
    datas=datas,
    # Modules imported dynamically (invisible to PyInstaller's static
    # analysis) must be declared explicitly here.
    hiddenimports=[
        'docling',
        'docling.document_converter',
        'docling.chunking',
        'docling.models',
        'docling.models.plugins',
        'docling.models.plugins.defaults',
        'docling.backend',
        'docling.backend.docling_parse_backend',
        'docling.backend.asciidoc_backend',
        'docling.backend.html_backend',
        'docling.backend.md_backend',
        'docling.backend.msexcel_backend',
        'docling.backend.mspowerpoint_backend',
        'docling.backend.msword_backend',
        'docling.datamodel',
        'docling.datamodel.document',
        'docling_core',
        'docling_core.transforms.chunker',
        'docling_core.transforms.chunker.tokenizer',
        'docling_core.transforms.chunker.tokenizer.huggingface',
        'transformers',
        'transformers.models',
        'transformers.models.auto',
        'torch',
        'numpy',
        'PIL',
        'pdfplumber',
        'pypdf',
        'pikepdf',
        'lxml',
        'bs4',
        'tiktoken',
        'tokenizers',
        'sentencepiece',
        'safetensors',
        'huggingface_hub',
        'tqdm',
        'regex',
        'requests',
        'urllib3',
        'certifi',
        'charset_normalizer',
        'idna',
        'packaging',
        'filelock',
        'pyyaml',
        'jinja2',
        'markupsafe',
    ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.zipfiles,
    a.datas,
    [],
    name='pdf_processor',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
@@ -0,0 +1,420 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ PDF to Markdown Converter using Docling
4
+ Converts a PDF to JSON with page-separated markdown and images.
5
+
6
+ Usage:
7
+ pdf_to_markdown.py <pdf_file_path> [-o OUTPUT_PATH] [--max-tokens MAX_TOKENS]
8
+ """
9
+
10
+ import sys
11
+ import os
12
+ import warnings
13
+ import argparse
14
+ import json
15
+ from pathlib import Path
16
+ from PIL import Image
17
+
18
+ # Suppress warnings
19
+ warnings.filterwarnings('ignore')
20
+ os.environ['PYTHONWARNINGS'] = 'ignore'
21
+
22
+ from docling.document_converter import DocumentConverter, PdfFormatOption
23
+ from docling.datamodel.base_models import InputFormat
24
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
25
+ from hierarchical.postprocessor import ResultPostprocessor
26
+
27
+ IMAGE_RESOLUTION_SCALE = 2.0
28
+
29
def normalize_markdown_content(content: str) -> str:
    """
    Normalize markdown content by collapsing excess whitespace.

    Table rows (any line containing ``|``) have each cell stripped and are
    re-joined with single-space padding; all other lines only have trailing
    whitespace removed.

    Args:
        content: Raw markdown content

    Returns:
        Normalized markdown content
    """
    # Fix: the original imported `re` here but never used it; removed.
    normalized_lines = []

    for line in content.split('\n'):
        if '|' in line:
            # Table row: strip whitespace from each cell.
            # Note: leading/trailing '|' yields empty edge cells, so the
            # re-joined line keeps its outer pipes, padded with one space.
            cells = [cell.strip() for cell in line.split('|')]
            normalized_lines.append(' | '.join(cells))
        else:
            # Non-table line: drop trailing whitespace only.
            normalized_lines.append(line.rstrip())

    return '\n'.join(normalized_lines)
59
+
60
+
61
def extract_headings_from_markdown(markdown_content: str) -> list:
    """
    Extract all markdown headings as (level, text) tuples.

    Args:
        markdown_content: Markdown text content

    Returns:
        List of (level, text) tuples in order of appearance
    """
    import re

    # ATX headings only: 1-6 '#' characters, whitespace, then the title.
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    found = []

    for raw_line in markdown_content.split('\n'):
        match = heading_re.match(raw_line.strip())
        if match is None:
            continue
        hashes, title = match.groups()
        # Heading level is the number of '#' characters.
        found.append((len(hashes), title.strip()))

    return found
85
+
86
+
87
def build_hierarchy_from_stack(heading_stack: list) -> dict:
    """
    Build a nested hierarchy dict from a heading path.

    Args:
        heading_stack: List of (level, text) tuples representing the current path

    Returns:
        Nested dict where each heading nests its successor; the deepest
        heading maps to None.
    """
    hierarchy = {}
    node = hierarchy
    last_index = len(heading_stack) - 1

    for index, (_, title) in enumerate(heading_stack):
        if index == last_index:
            # Deepest heading on the path: mark it as a leaf.
            node[title] = None
        else:
            # Intermediate heading: descend, creating the child dict if needed.
            node = node.setdefault(title, {})

    return hierarchy
111
+
112
+
113
def merge_hierarchies(h1: dict, h2: dict) -> dict:
    """
    Deep-merge two heading-hierarchy dicts.

    Keys present in both sides with dict values are merged recursively;
    otherwise a non-None value from h2 wins, falling back to h1's value.
    """
    # Trivial cases: one side empty — shallow-copy the other.
    if not h1:
        return dict(h2) if h2 else {}
    if not h2:
        return dict(h1)

    merged = {}
    for key in set(h1) | set(h2):
        left = h1.get(key)
        right = h2.get(key)
        if key not in h2:
            merged[key] = left
        elif key not in h1:
            merged[key] = right
        elif isinstance(left, dict) and isinstance(right, dict):
            # Both sides have children under this heading: merge recursively.
            merged[key] = merge_hierarchies(left, right)
        else:
            # Prefer h2's value unless it is the None leaf marker.
            merged[key] = right if right is not None else left

    return merged
140
+
141
+
142
def parse_heading_hierarchy(markdown_content: str) -> dict:
    """
    Parse markdown content and build a nested heading hierarchy.

    Headings nest under the nearest shallower heading that precedes them;
    headings at the same level become siblings. Leaf headings map to None.

    Args:
        markdown_content: Markdown text content

    Returns:
        Nested dictionary representing the heading hierarchy
    """
    import re

    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    hierarchy = {}
    path = []  # stack of (level, text) describing the current heading path

    for raw_line in markdown_content.split('\n'):
        match = heading_re.match(raw_line.strip())
        if not match:
            continue

        level = len(match.group(1))  # number of '#' characters
        title = match.group(2).strip()

        # Unwind to the nearest ancestor shallower than this heading
        # (drops siblings and their children when moving back up).
        while path and path[-1][0] >= level:
            path.pop()
        path.append((level, title))

        # Graft the current path into the accumulated hierarchy.
        node = hierarchy
        depth = len(path)
        for position, (_, name) in enumerate(path):
            is_leaf = position == depth - 1
            if name not in node:
                # New heading: leaf markers are None, inner nodes are dicts.
                node[name] = None if is_leaf else {}
            if not is_leaf:
                # Promote a former leaf to an inner node before descending.
                if node[name] is None:
                    node[name] = {}
                node = node[name]

    return hierarchy
192
+
193
+
194
def process_pdf_to_json(pdf_path: str, output_path: str = None, images_dir: str = None) -> list:
    """
    Process a PDF file using Docling and return JSON with page-separated markdown and images.

    Args:
        pdf_path: Path to the PDF file
        output_path: Optional output path for JSON file; when set, the page
            objects are also written there as UTF-8 JSON.
        images_dir: Directory to save page images. NOTE(review): this is passed
            straight to Path(), so a None value raises TypeError — callers
            (main) appear to always supply it; confirm before reuse.

    Returns:
        List of page dicts: {"page", "content", "image", "headings"}
    """
    # Configure the PDF pipeline so Docling renders a bitmap of every page.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True  # required for page_{n}.png exports below

    # Convert the PDF document with the configured pipeline.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    result = converter.convert(source=pdf_path)

    # Apply hierarchical post-processing to fix the heading hierarchy in place.
    print(f"Applying hierarchical post-processing...", file=sys.stderr)
    ResultPostprocessor(result, source=pdf_path).process()

    doc = result.document

    # Export one markdown string for the whole document, with an HTML comment
    # marking every page boundary so it can be split back into pages.
    full_markdown = doc.export_to_markdown(page_break_placeholder="<!-- END_OF_PAGE -->")

    # Split by page markers.
    pages = full_markdown.split("<!-- END_OF_PAGE -->")

    # Ensure images_dir is a Path object and the directory exists.
    images_dir = Path(images_dir)
    images_dir.mkdir(exist_ok=True)

    # Map of page number -> saved image path (filled below).
    page_images = {}

    # Check if page images are in the result object.
    if hasattr(result, 'pages') and result.pages:
        # NOTE(review): redundant — the directory was already created above.
        images_dir.mkdir(exist_ok=True)

        for page_data in result.pages:
            # Page number attribute name varies; try both spellings.
            page_no = getattr(page_data, 'page_no', None) or getattr(page_data, 'page_number', None)

            # Check for image attribute
            if hasattr(page_data, 'image') and page_data.image:
                # Save the PIL image to disk as page_<n>.png.
                image_filename = f"page_{page_no}.png"
                image_path = images_dir / image_filename

                # Save the image only when it really is a PIL image.
                if isinstance(page_data.image, Image.Image):
                    page_data.image.save(str(image_path), 'PNG')
                    page_images[page_no] = str(image_path)
                    print(f"Saved page {page_no} image to: {image_path}", file=sys.stderr)

    # Build page objects with cumulative heading hierarchy.
    page_objects = []
    cumulative_markdown = ""  # NOTE(review): accumulated but never read afterwards
    heading_stack = []  # current heading context (stack of (level, text) tuples)

    # Build JSON structure with page-separated content.
    for page_num, page_content in enumerate(pages, start=1):
        # Skip empty pages.
        if not page_content.strip():
            continue

        # Add current page to cumulative markdown.
        cumulative_markdown += page_content + "\n"

        # Extract headings from current page only.
        page_headings = extract_headings_from_markdown(page_content)

        # Union of every heading context seen on this page.
        page_hierarchy = {}

        # If no headings on this page, the page continues the last open
        # section, so inherit the context carried over from previous pages.
        if not page_headings:
            if heading_stack:
                page_hierarchy = build_hierarchy_from_stack(heading_stack)
        else:
            # Process each heading on the current page.
            for level, text in page_headings:
                # Pop headings from stack that are at the same or a deeper level.
                while heading_stack and heading_stack[-1][0] >= level:
                    heading_stack.pop()

                # Add this heading to the stack.
                heading_stack.append((level, text))

                # Build the hierarchy for this context and merge it in.
                context_hierarchy = build_hierarchy_from_stack(heading_stack)
                page_hierarchy = merge_hierarchies(page_hierarchy, context_hierarchy)

        # Image path for this page, if one was rendered.
        # NOTE(review): assumes docling's page_no matches this 1-based
        # enumeration of markdown pages — confirm against docling's output.
        page_image_path = page_images.get(page_num)

        # Normalize the content to remove excessive whitespace.
        normalized_content = normalize_markdown_content(page_content.strip())

        page_objects.append({
            "page": page_num,
            "content": normalized_content,
            "image": page_image_path,
            "headings": page_hierarchy
        })

    # Save to JSON file if an output path was provided.
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(page_objects, f, indent=2, ensure_ascii=False)
            f.flush()
        print(f"Successfully saved JSON to: {output_path}", file=sys.stderr)
        print(f"Images saved to: {images_dir}", file=sys.stderr)

    return page_objects
319
+
320
+
321
def main():
    """CLI entry point: parse arguments, resolve output locations, run the conversion."""
    # Set up argument parser.
    parser = argparse.ArgumentParser(
        description='Convert PDF to Markdown using Docling with hierarchical headings and page markers',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        'pdf_path',
        type=str,
        help='Path to the PDF file to convert'
    )

    parser.add_argument(
        '-o', '--output',
        type=str,
        dest='output_path',
        help='Output path for the JSON file (default: same name as PDF with .json extension)'
    )

    parser.add_argument(
        '--images-dir',
        type=str,
        dest='images_dir',
        help='Directory to save page images (default: <pdf_name>_images/)'
    )

    # Accepted for forward compatibility; the parsed value is never used.
    parser.add_argument(
        '--max-tokens',
        type=int,
        dest='max_tokens',
        help='Maximum number of tokens (currently not used, reserved for future use)'
    )

    # Parse arguments.
    args = parser.parse_args()

    pdf_path = args.pdf_path
    output_path = args.output_path
    images_dir = args.images_dir

    # Validate the input file exists before doing any work.
    if not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    # Create a shared folder named after the source file; default outputs live here.
    pdf_file = Path(pdf_path)
    shared_folder = pdf_file.parent / pdf_file.stem
    shared_folder.mkdir(exist_ok=True)

    # Default: JSON file inside the shared folder.
    if not output_path:
        output_path = str(shared_folder / "processed.json")
    else:
        # If output_path is an existing directory, write processed.json inside it.
        output_path_obj = Path(output_path)
        if output_path_obj.is_dir():
            output_path = str(output_path_obj / "processed.json")
        elif not output_path_obj.suffix:
            # No extension provided: treat the path as a directory and create it.
            output_path_obj.mkdir(exist_ok=True)
            output_path = str(output_path_obj / "processed.json")

    # Default: images directory next to the JSON output.
    if not images_dir:
        # If output_path was provided and is in a custom location, use that location's parent.
        output_parent = Path(output_path).parent
        images_dir = str(output_parent / "images")

    try:
        # Process the PDF. All diagnostics go to stderr; stdout stays clean.
        print(f"Processing PDF: {pdf_path}", file=sys.stderr)
        page_objects = process_pdf_to_json(pdf_path, output_path, images_dir)

        # Print stats.
        total_content_length = sum(len(page['content']) for page in page_objects)
        images_with_content = sum(1 for page in page_objects if page.get('image'))

        print(f"\nJSON output stats:", file=sys.stderr)
        print(f" Total pages: {len(page_objects)}", file=sys.stderr)
        print(f" Pages with images: {images_with_content}", file=sys.stderr)
        print(f" Total content characters: {total_content_length}", file=sys.stderr)

        # Hard exit after flushing. os._exit skips atexit handlers and thread
        # joins — presumably to avoid hanging on lingering worker threads from
        # torch/transformers; TODO confirm before changing to sys.exit.
        sys.stderr.flush()
        sys.stdout.flush()
        os._exit(0)

    except Exception as e:
        # Report the failure with a traceback, then hard-exit non-zero.
        print(f"Error processing PDF: {str(e)}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        sys.stderr.flush()
        os._exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,4 @@
1
+ docling
2
+ transformers
3
+ pyinstaller
4
+ docling-hierarchical-pdf
@@ -0,0 +1,49 @@
1
// Enterprise-edition feature flags. These are the community-edition defaults:
// every entitlement is disabled until checkLicense() finds a license key with
// the expected prefix.
// NOTE(review): the licensed branch of checkLicense() also returns "mcp" and
// "prompt-library", which are not part of this shape — verify which key set
// is authoritative.
export const ENTITLEMENTS: {
  "rbac": boolean,
  "advanced-markdown-chunker": boolean,
  "agentic-retrieval": boolean,
  "queues": boolean,
  "custom-branding": boolean,
  "evals": boolean,
  "template-conversations": boolean,
  "agent-feedback": boolean,
  "multi-agent-tooling": boolean,
  "advanced-document-processing": boolean
} = {
  "rbac": false,
  "advanced-markdown-chunker": false,
  "agentic-retrieval": false,
  "queues": false,
  "custom-branding": false,
  "evals": false,
  "template-conversations": false,
  "agent-feedback": false,
  "multi-agent-tooling": false,
  "advanced-document-processing": false
}
24
+
25
+ export const checkLicense = () => {
26
+ if (
27
+ !process.env.EXULU_ENTERPRISE_LICENSE ||
28
+ process.env.EXULU_ENTERPRISE_LICENSE === "" ||
29
+ !process.env.EXULU_ENTERPRISE_LICENSE.startsWith("EXULU_EE_")
30
+ ) {
31
+ return ENTITLEMENTS
32
+ } else {
33
+ return {
34
+ "rbac": true,
35
+ "advanced-markdown-chunker": true,
36
+ "agentic-retrieval": true,
37
+ "mcp": true,
38
+ "queues": true,
39
+ "prompt-library": true,
40
+ "custom-branding": true,
41
+ "evals": true,
42
+ "template-conversations": true,
43
+ "agent-feedback": true,
44
+ "multi-agent-tooling": true,
45
+ "advanced-document-processing": true
46
+ }
47
+ }
48
+
49
+ }