@redpanda-data/docs-extensions-and-macros 4.11.0 → 4.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/bin/doc-tools.js +4 -2
  2. package/extensions/convert-to-markdown.js +17 -1
  3. package/package.json +3 -1
  4. package/tools/property-extractor/COMPUTED_CONSTANTS.md +173 -0
  5. package/tools/property-extractor/Makefile +12 -1
  6. package/tools/property-extractor/README.adoc +828 -97
  7. package/tools/property-extractor/compare-properties.js +38 -13
  8. package/tools/property-extractor/constant_resolver.py +610 -0
  9. package/tools/property-extractor/file_pair.py +42 -0
  10. package/tools/property-extractor/generate-handlebars-docs.js +41 -8
  11. package/tools/property-extractor/helpers/gt.js +9 -0
  12. package/tools/property-extractor/helpers/includes.js +17 -0
  13. package/tools/property-extractor/helpers/index.js +3 -0
  14. package/tools/property-extractor/helpers/isEnterpriseEnum.js +24 -0
  15. package/tools/property-extractor/helpers/renderPropertyExample.js +6 -5
  16. package/tools/property-extractor/overrides.json +248 -0
  17. package/tools/property-extractor/parser.py +254 -32
  18. package/tools/property-extractor/property_bag.py +40 -0
  19. package/tools/property-extractor/property_extractor.py +1417 -430
  20. package/tools/property-extractor/requirements.txt +1 -0
  21. package/tools/property-extractor/templates/property-backup.hbs +161 -0
  22. package/tools/property-extractor/templates/property.hbs +104 -49
  23. package/tools/property-extractor/templates/topic-property-backup.hbs +148 -0
  24. package/tools/property-extractor/templates/topic-property.hbs +72 -34
  25. package/tools/property-extractor/tests/test_known_values.py +617 -0
  26. package/tools/property-extractor/tests/transformers_test.py +81 -6
  27. package/tools/property-extractor/topic_property_extractor.py +23 -10
  28. package/tools/property-extractor/transformers.py +2191 -369
  29. package/tools/property-extractor/type_definition_extractor.py +669 -0
  30. package/tools/property-extractor/definitions.json +0 -245
@@ -65,12 +65,308 @@ from copy import deepcopy
65
65
  from pathlib import Path
66
66
  from file_pair import FilePair
67
67
  from tree_sitter import Language, Parser
68
+ import operator
69
+ import re
68
70
 
69
71
  from parser import build_treesitter_cpp_library, extract_properties_from_file_pair
70
72
  from property_bag import PropertyBag
71
73
  from transformers import *
74
+ from constant_resolver import ConstantResolver
75
+
76
+ # Compiled regex patterns for performance optimization
77
+ VECTOR_PATTERN = re.compile(r'std::vector<[^>]+>\s*\{\s*([^}]*)\s*\}')
78
+ ENUM_PATTERN = re.compile(r'^[a-zA-Z0-9_:]+::([a-zA-Z0-9_]+)$') # Match a fully qualified identifier that is not followed by constructor arguments
79
+ CONSTRUCTOR_PATTERN = re.compile(r'([a-zA-Z0-9_:]+)\((.*)\)')
80
+ BRACED_CONSTRUCTOR_PATTERN = re.compile(r'([a-zA-Z0-9_:]+)\{(.*)\}')
81
+ DIGIT_SEPARATOR_PATTERN = re.compile(r"(?<=\d)'(?=\d)")
82
+ FUNCTION_CALL_PATTERN = re.compile(r'([a-zA-Z0-9_:]+)\(\)')
83
+ CHRONO_PATTERN = re.compile(r'std::chrono::([a-zA-Z]+)\s*\{\s*(\d+)\s*\}')
84
+ CHRONO_PAREN_PATTERN = re.compile(r'(?:std::)?chrono::([a-zA-Z]+)\s*\(\s*([^)]+)\s*\)')
85
+ TIME_UNIT_PATTERN = re.compile(r'(\d+)\s*(min|s|ms|h)')
86
+ ADDRESS_PATTERN = re.compile(r'net::unresolved_address\s*\(\s*"?([^",]+)"?\s*,\s*([^)]+)\)')
87
+ KEYVAL_PATTERN = re.compile(r"'([^']+)':\s*'([^']+)'")
88
+ IDENTIFIER_PATTERN = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
89
+ SSTRING_PATTERN = re.compile(r'ss::sstring\{([a-zA-Z_][a-zA-Z0-9_]*)\}')
90
+ UNDERSCORE_PREFIX_PATTERN = re.compile(r"^_")
91
+
92
+ class ConstexprCache:
93
+ """
94
+ Cache for C++ constexpr identifier and function lookups to avoid repeated filesystem walks.
95
+
96
+ This class dramatically improves performance when processing large numbers of properties
97
+ by building a cache of all constexpr definitions once, then serving lookups from memory.
98
+
99
+ Performance Impact:
100
+ - Without cache: O(n*m) where n = properties, m = source files (thousands of filesystem operations)
101
+ - With cache: O(m + n) where cache build is O(m), lookups are O(1) (single filesystem walk)
102
+ """
103
+
104
+ def __init__(self):
105
+ self.constexpr_cache = {} # identifier -> value
106
+ self.function_cache = {} # function_name -> value
107
+ self.is_built = False
108
+ self.redpanda_source = None
109
+
110
+ def build_cache(self, redpanda_source=None):
111
+ """
112
+ Build the cache by walking the Redpanda source tree once and extracting all constexpr definitions.
113
+
114
+ Args:
115
+ redpanda_source (str, optional): Path to Redpanda source. If None, will be auto-detected.
116
+ """
117
+ if self.is_built and self.redpanda_source == redpanda_source:
118
+ return # Already built for this source
119
+
120
+ if not redpanda_source:
121
+ redpanda_source = find_redpanda_source()
122
+
123
+ if not redpanda_source:
124
+ logger.warning("Could not find Redpanda source directory for constexpr cache")
125
+ return
126
+
127
+ self.redpanda_source = redpanda_source
128
+ self.constexpr_cache.clear()
129
+ self.function_cache.clear()
130
+
131
+ # Constexpr identifier patterns
132
+ constexpr_patterns = [
133
+ re.compile(r'inline\s+constexpr\s+std::string_view\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\{\s*"([^"]+)"\s*\}'),
134
+ re.compile(r'constexpr\s+std::string_view\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\{\s*"([^"]+)"\s*\}'),
135
+ re.compile(r'inline\s+constexpr\s+auto\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"([^"]+)"'),
136
+ re.compile(r'constexpr\s+auto\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*"([^"]+)"'),
137
+ re.compile(r'static\s+constexpr\s+std::string_view\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\{\s*"([^"]+)"\s*\}'),
138
+ re.compile(r'static\s+inline\s+constexpr\s+std::string_view\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\{\s*"([^"]+)"\s*\}'),
139
+ ]
140
+
141
+ # General function patterns to extract ALL string-returning functions
142
+ # These patterns capture: namespace::function_name and the returned string
143
+ general_function_patterns = [
144
+ # Pattern: inline constexpr std::string_view name { "value" }
145
+ re.compile(r'inline\s+constexpr\s+std::string_view\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\{\s*"([^"]+)"\s*\}'),
146
+ # Pattern: constexpr std::string_view name { "value" }
147
+ re.compile(r'constexpr\s+std::string_view\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\{\s*"([^"]+)"\s*\}'),
148
+ # Pattern: inline std::string_view name() { return "value"; }
149
+ re.compile(r'inline\s+std::string_view\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*\)\s*\{\s*return\s*"([^"]+)"'),
150
+ # Pattern: std::string_view name() { return "value"; }
151
+ re.compile(r'std::string_view\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*\)\s*\{\s*return\s*"([^"]+)"'),
152
+ # Pattern: inline const model::topic name("value")
153
+ re.compile(r'inline\s+const\s+model::topic\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*"([^"]+)"\s*\)'),
154
+ # Pattern: const model::topic name("value")
155
+ re.compile(r'const\s+model::topic\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*"([^"]+)"\s*\)'),
156
+ # Pattern: inline const model::ns name("value")
157
+ re.compile(r'inline\s+const\s+model::ns\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*"([^"]+)"\s*\)'),
158
+ # Pattern: const model::ns name("value")
159
+ re.compile(r'const\s+model::ns\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*"([^"]+)"\s*\)'),
160
+ ]
161
+
162
+ # Legacy specific patterns (kept for compatibility, but general patterns should cover these)
163
+ function_patterns = {}
164
+
165
+ search_dirs = [
166
+ os.path.join(redpanda_source, 'src', 'v', 'model'), # For model:: functions
167
+ os.path.join(redpanda_source, 'src', 'v', 'config'),
168
+ os.path.join(redpanda_source, 'src', 'v', 'kafka'),
169
+ os.path.join(redpanda_source, 'src', 'v', 'security'),
170
+ os.path.join(redpanda_source, 'src', 'v', 'pandaproxy'),
171
+ ]
172
+
173
+ files_processed = 0
174
+ for search_dir in search_dirs:
175
+ if not os.path.exists(search_dir):
176
+ continue
177
+
178
+ for root, dirs, files in os.walk(search_dir):
179
+ for file in files:
180
+ if file.endswith(('.h', '.cc', '.hpp', '.cpp')):
181
+ file_path = os.path.join(root, file)
182
+ try:
183
+ with open(file_path, 'r', encoding='utf-8') as f:
184
+ content = f.read()
185
+
186
+ # Extract constexpr identifiers
187
+ for pattern in constexpr_patterns:
188
+ for match in pattern.finditer(content):
189
+ identifier = match.group(1)
190
+ value = match.group(2)
191
+ self.constexpr_cache[identifier] = value
192
+
193
+ # Extract ALL string-returning functions using general patterns
194
+ # This replaces hardcoded function patterns
195
+ for pattern in general_function_patterns:
196
+ for match in pattern.finditer(content):
197
+ func_name = match.group(1)
198
+ func_value = match.group(2)
199
+
200
+ # Try to determine namespace for the function
201
+ namespace = self._extract_namespace_for_function(content, match.start())
202
+
203
+ # Store with both simple name and qualified name
204
+ self.function_cache[func_name] = func_value
205
+ if namespace:
206
+ qualified_name = f"{namespace}::{func_name}"
207
+ self.function_cache[qualified_name] = func_value
208
+
209
+ # Legacy: Extract function definitions from hardcoded patterns (if any)
210
+ for func_name, patterns in function_patterns.items():
211
+ for pattern in patterns:
212
+ match = pattern.search(content)
213
+ if match:
214
+ self.function_cache[func_name] = match.group(1)
215
+ break
216
+
217
+ files_processed += 1
218
+
219
+ except (FileNotFoundError, PermissionError, OSError, UnicodeDecodeError) as e:
220
+ logger.debug(f"Error reading {file_path} for cache: {e}")
221
+ continue
222
+
223
+ self.is_built = True
224
+ logger.debug(f"Built constexpr cache: {len(self.constexpr_cache)} identifiers, "
225
+ f"{len(self.function_cache)} functions from {files_processed} files")
226
+
227
+ def _extract_namespace_for_function(self, content, position):
228
+ """
229
+ Extract the namespace at a given position in the file.
230
+
231
+ Args:
232
+ content (str): File content
233
+ position (int): Position in the file
234
+
235
+ Returns:
236
+ str: Namespace (e.g., "model" or "config::tls")
237
+ """
238
+ # Look backwards from position to find namespace declaration
239
+ preceding = content[:position]
240
+
241
+ # Find all namespace declarations before this position
242
+ namespace_pattern = re.compile(r'namespace\s+(\w+)\s*\{')
243
+ namespaces = []
244
+
245
+ for match in namespace_pattern.finditer(preceding):
246
+ ns_name = match.group(1)
247
+ # Check if we're still inside this namespace by tracking brace depth
248
+ # Start with depth=1 (we entered the namespace with its opening brace)
249
+ after_ns = content[match.end():position]
250
+ brace_depth = 1
251
+
252
+ for char in after_ns:
253
+ if char == '{':
254
+ brace_depth += 1
255
+ elif char == '}':
256
+ brace_depth -= 1
257
+ if brace_depth == 0:
258
+ # Namespace was closed before reaching current position
259
+ break
260
+
261
+ if brace_depth > 0:
262
+ # Still inside this namespace
263
+ namespaces.append(ns_name)
264
+
265
+ return '::'.join(namespaces) if namespaces else ''
266
+
267
+ def lookup_constexpr(self, identifier):
268
+ """
269
+ Look up a constexpr identifier value from the cache.
270
+
271
+ Args:
272
+ identifier (str): The identifier to look up
273
+
274
+ Returns:
275
+ str or None: The resolved value if found, None otherwise
276
+ """
277
+ if not self.is_built:
278
+ self.build_cache()
279
+
280
+ return self.constexpr_cache.get(identifier)
281
+
282
+ def lookup_function(self, function_name):
283
+ """
284
+ Look up a function call result from the cache.
285
+
286
+ Args:
287
+ function_name (str): The function name to look up
288
+
289
+ Returns:
290
+ str or None: The resolved value if found, None otherwise
291
+ """
292
+ if not self.is_built:
293
+ self.build_cache()
294
+
295
+ return self.function_cache.get(function_name)
296
+
297
+ # Global cache instance
298
+ _constexpr_cache = ConstexprCache()
299
+
300
+ # Global storage for type definitions (used by transformers for enum mapping)
301
+ _type_definitions_cache = {}
72
302
 
73
303
  # Import topic property extractor
304
+ def find_redpanda_source():
305
+ """
306
+ Locate the Redpanda source directory by searching standard locations.
307
+
308
+ The property extractor looks for the Redpanda source code in multiple
309
+ locations to handle different execution contexts (project root, tools directory, etc.).
310
+
311
+ Returns:
312
+ str or None: Path to the Redpanda source directory if found, None otherwise.
313
+ """
314
+ redpanda_source_paths = [
315
+ 'tmp/redpanda', # Current directory
316
+ '../tmp/redpanda', # Parent directory
317
+ 'tools/property-extractor/tmp/redpanda', # From project root
318
+ os.path.join(os.getcwd(), 'tools', 'property-extractor', 'tmp', 'redpanda')
319
+ ]
320
+
321
+ for path in redpanda_source_paths:
322
+ if os.path.exists(path):
323
+ return path
324
+
325
+ return None
326
+
327
+ def safe_arithmetic_eval(expression):
328
+ """
329
+ Safely evaluate simple arithmetic expressions like '60 * 5' without using eval().
330
+ Only allows basic operators: +, -, *, /, //, and % (the ** operator is not supported)
331
+ Only works with integers and basic arithmetic.
332
+
333
+ Returns the result if successful, raises ValueError if unsafe or invalid.
334
+ """
335
+ # Only allow safe characters: digits, spaces, and basic operators
336
+ allowed_chars = set('0123456789+-*/%() ')
337
+ if not all(c in allowed_chars for c in expression):
338
+ raise ValueError("Expression contains unsafe characters")
339
+
340
+ # Simple operator mapping for basic arithmetic
341
+ ops = {
342
+ '+': operator.add,
343
+ '-': operator.sub,
344
+ '*': operator.mul,
345
+ '/': operator.truediv,
346
+ '//': operator.floordiv,
347
+ '%': operator.mod,
348
+ }
349
+
350
+ # For simple cases like "60 * 5", handle directly
351
+ for op_str, op_func in ops.items():
352
+ if op_str in expression:
353
+ parts = expression.split(op_str)
354
+ if len(parts) == 2:
355
+ try:
356
+ left = int(parts[0].strip())
357
+ right = int(parts[1].strip())
358
+ return int(op_func(left, right))
359
+ except (ValueError, ZeroDivisionError):
360
+ pass
361
+
362
+ # If it's just a number, return it
363
+ try:
364
+ return int(expression.strip())
365
+ except ValueError:
366
+ pass
367
+
368
+ raise ValueError("Could not safely evaluate expression")
369
+
74
370
  try:
75
371
  from topic_property_extractor import TopicPropertyExtractor
76
372
  except ImportError:
@@ -116,29 +412,11 @@ def process_enterprise_value(enterprise_str):
116
412
  Union[str, bool, list]: A JSON-serializable representation of the input.
117
413
  """
118
414
  enterprise_str = enterprise_str.strip()
119
-
120
- # Handle special SASL mechanism function names
121
- if enterprise_str == "is_enterprise_sasl_mechanism":
122
- # Dynamically look up enterprise SASL mechanisms from source
123
- enterprise_mechanisms = get_enterprise_sasl_mechanisms()
124
- if enterprise_mechanisms:
125
- return enterprise_mechanisms
126
- else:
127
- # Fallback to known values if lookup fails
128
- return ["GSSAPI", "OAUTHBEARER"]
129
- elif enterprise_str == "is_enterprise_sasl_mechanisms_override":
130
- # Get the enterprise mechanisms dynamically for a more accurate description
131
- enterprise_mechanisms = get_enterprise_sasl_mechanisms()
132
- if enterprise_mechanisms:
133
- mechanism_list = ", ".join(enterprise_mechanisms)
134
- return f"Any override containing enterprise mechanisms ({mechanism_list})."
135
- else:
136
- return "Any override containing enterprise mechanisms."
137
-
415
+
138
416
  # FIRST: Handle std::vector initialization patterns (highest priority)
139
417
  # This must come before enum processing because vectors can contain enums
140
418
  # Tolerate optional whitespace around braces
141
- vector_match = re.match(r'std::vector<[^>]+>\s*\{\s*([^}]*)\s*\}', enterprise_str)
419
+ vector_match = VECTOR_PATTERN.match(enterprise_str)
142
420
  if vector_match:
143
421
  content = vector_match.group(1).strip()
144
422
  if not content:
@@ -161,7 +439,7 @@ def process_enterprise_value(enterprise_str):
161
439
  values.append(ast.literal_eval(value))
162
440
  else:
163
441
  # Handle enum values in the vector
164
- enum_match = re.match(r'[a-zA-Z0-9_:]+::([a-zA-Z0-9_]+)', value)
442
+ enum_match = ENUM_PATTERN.match(value)
165
443
  if enum_match:
166
444
  values.append(enum_match.group(1))
167
445
  else:
@@ -177,7 +455,7 @@ def process_enterprise_value(enterprise_str):
177
455
  values.append(ast.literal_eval(value))
178
456
  else:
179
457
  # Handle enum values in the vector
180
- enum_match = re.match(r'[a-zA-Z0-9_:]+::([a-zA-Z0-9_]+)', value)
458
+ enum_match = ENUM_PATTERN.match(value)
181
459
  if enum_match:
182
460
  values.append(enum_match.group(1))
183
461
  else:
@@ -186,18 +464,16 @@ def process_enterprise_value(enterprise_str):
186
464
  return values
187
465
 
188
466
  # SECOND: Handle enum-like patterns (extract the last part after ::)
189
- enum_match = re.match(r'[a-zA-Z0-9_:]+::([a-zA-Z0-9_]+)', enterprise_str)
467
+ enum_match = ENUM_PATTERN.match(enterprise_str)
190
468
  if enum_match:
191
469
  enum_value = enum_match.group(1)
192
470
  return enum_value
193
471
 
194
472
  # THIRD: Handle C++ lambda expressions - these usually indicate "any non-default value"
195
473
  if enterprise_str.startswith("[](") and enterprise_str.endswith("}"):
196
- # For lambda expressions, try to extract meaningful info from the logic
197
- if "leaders_preference" in enterprise_str:
198
- return "Any rack preference (not `none`)"
199
- else:
200
- return "Enterprise feature enabled"
474
+ # For lambda expressions, return a generic message
475
+ # No hardcoded logic for specific properties
476
+ return "Enterprise feature enabled"
201
477
 
202
478
  # FOURTH: Handle simple values with proper JSON types
203
479
  # Convert boolean literals to actual boolean values for JSON compatibility
@@ -214,168 +490,79 @@ def process_enterprise_value(enterprise_str):
214
490
 
215
491
  def resolve_cpp_function_call(function_name):
216
492
  """
217
- Resolve a small set of known zero-argument C++ functions to their literal string return values by scanning a local Redpanda source tree.
218
-
219
- Searches predefined files and regex patterns for the specified fully-qualified function name (e.g., "model::kafka_audit_logging_topic") and returns the captured string if found; returns None when no match or when the Redpanda source tree cannot be located.
220
-
493
+ Resolve zero-argument C++ functions to their literal string return values.
494
+
495
+ Uses the pre-built ConstexprCache which dynamically extracts ALL string-returning
496
+ functions from source using general patterns. No hardcoded patterns needed.
497
+
221
498
  Parameters:
222
- function_name (str): Fully-qualified C++ function name to resolve.
223
-
499
+ function_name (str): Fully-qualified C++ function name to resolve (e.g., "model::kafka_audit_logging_topic")
500
+
224
501
  Returns:
225
- str or None: The literal string returned by the C++ function when resolved, or `None` if unresolved.
502
+ str or None: The literal string returned by the C++ function, or None if not found in cache
226
503
  """
227
- # Map function names to likely search patterns and file locations
228
- search_patterns = {
229
- 'model::kafka_audit_logging_topic': {
230
- 'patterns': [
231
- r'inline\s+const\s+model::topic\s+kafka_audit_logging_topic\s*\(\s*"([^"]+)"\s*\)',
232
- r'const\s+model::topic\s+kafka_audit_logging_topic\s*\(\s*"([^"]+)"\s*\)',
233
- r'model::topic\s+kafka_audit_logging_topic\s*\(\s*"([^"]+)"\s*\)',
234
- r'std::string_view\s+kafka_audit_logging_topic\s*\(\s*\)\s*\{\s*return\s*"([^"]+)"',
235
- r'inline\s+std::string_view\s+kafka_audit_logging_topic\s*\(\s*\)\s*\{\s*return\s*"([^"]+)"'
236
- ],
237
- 'files': ['src/v/model/namespace.h', 'src/v/model/namespace.cc', 'src/v/model/kafka_namespace.h']
238
- },
239
- 'model::kafka_consumer_offsets_topic': {
240
- 'patterns': [
241
- r'inline\s+const\s+model::topic\s+kafka_consumer_offsets_topic\s*\(\s*"([^"]+)"\s*\)',
242
- r'const\s+model::topic\s+kafka_consumer_offsets_topic\s*\(\s*"([^"]+)"\s*\)',
243
- r'model::topic\s+kafka_consumer_offsets_topic\s*\(\s*"([^"]+)"\s*\)',
244
- r'std::string_view\s+kafka_consumer_offsets_topic\s*\(\s*\)\s*\{\s*return\s*"([^"]+)"',
245
- r'inline\s+std::string_view\s+kafka_consumer_offsets_topic\s*\(\s*\)\s*\{\s*return\s*"([^"]+)"'
246
- ],
247
- 'files': ['src/v/model/namespace.h', 'src/v/model/namespace.cc', 'src/v/model/kafka_namespace.h']
248
- },
249
- 'model::kafka_internal_namespace': {
250
- 'patterns': [
251
- r'inline\s+const\s+model::ns\s+kafka_internal_namespace\s*\(\s*"([^"]+)"\s*\)',
252
- r'const\s+model::ns\s+kafka_internal_namespace\s*\(\s*"([^"]+)"\s*\)',
253
- r'model::ns\s+kafka_internal_namespace\s*\(\s*"([^"]+)"\s*\)',
254
- r'std::string_view\s+kafka_internal_namespace\s*\(\s*\)\s*\{\s*return\s*"([^"]+)"',
255
- r'inline\s+std::string_view\s+kafka_internal_namespace\s*\(\s*\)\s*\{\s*return\s*"([^"]+)"'
256
- ],
257
- 'files': ['src/v/model/namespace.h', 'src/v/model/namespace.cc', 'src/v/model/kafka_namespace.h']
258
- }
259
- }
260
-
261
- # Check if we have search patterns for this function
262
- if function_name not in search_patterns:
263
- logger.debug(f"No search patterns defined for function: {function_name}")
264
- return None
265
-
266
- config = search_patterns[function_name]
267
-
268
- # Try to find the Redpanda source directory
269
- # Look for it in the standard locations used by the property extractor
270
- redpanda_source_paths = [
271
- 'tmp/redpanda', # Current directory
272
- '../tmp/redpanda', # Parent directory
273
- 'tools/property-extractor/tmp/redpanda', # From project root
274
- os.path.join(os.getcwd(), 'tools', 'property-extractor', 'tmp', 'redpanda')
275
- ]
276
-
277
- redpanda_source = None
278
- for path in redpanda_source_paths:
279
- if os.path.exists(path):
280
- redpanda_source = path
281
- break
282
-
283
- if not redpanda_source:
284
- logger.warning(f"Could not find Redpanda source directory to resolve function: {function_name}")
285
- return None
286
-
287
- # Search in the specified files
288
- for file_path in config['files']:
289
- full_path = os.path.join(redpanda_source, file_path)
290
- if not os.path.exists(full_path):
291
- continue
292
-
293
- try:
294
- with open(full_path, 'r', encoding='utf-8') as f:
295
- content = f.read()
296
-
297
- # Try each pattern
298
- for pattern in config['patterns']:
299
- match = re.search(pattern, content, re.MULTILINE | re.DOTALL)
300
- if match:
301
- resolved_value = match.group(1)
302
- logger.debug(f"Resolved {function_name}() -> '{resolved_value}' from {file_path}")
303
- return resolved_value
304
-
305
- except Exception as e:
306
- logger.debug(f"Error reading {full_path}: {e}")
307
- continue
308
-
309
- # If not found in specific files, do a broader search
310
- logger.debug(f"Function {function_name} not found in expected files, doing broader search...")
311
-
312
- # Search more broadly in the model directory
313
- model_dir = os.path.join(redpanda_source, 'src', 'v', 'model')
314
- if os.path.exists(model_dir):
315
- for root, dirs, files in os.walk(model_dir):
316
- for file in files:
317
- if file.endswith('.h') or file.endswith('.cc'):
318
- file_path = os.path.join(root, file)
319
- try:
320
- with open(file_path, 'r', encoding='utf-8') as f:
321
- content = f.read()
322
-
323
- # Try patterns for this file
324
- for pattern in config['patterns']:
325
- match = re.search(pattern, content, re.MULTILINE | re.DOTALL)
326
- if match:
327
- resolved_value = match.group(1)
328
- logger.debug(f"Resolved {function_name}() -> '{resolved_value}' from {file_path}")
329
- return resolved_value
330
-
331
- except Exception as e:
332
- logger.debug(f"Error reading {file_path}: {e}")
333
- continue
334
-
335
- logger.warning(f"Could not resolve function call: {function_name}()")
504
+ # Look up function in the pre-built cache
505
+ # The cache was populated by ConstexprCache.build_cache() with general patterns
506
+ # that automatically discover ALL string-returning functions
507
+ cached_result = _constexpr_cache.lookup_function(function_name)
508
+ if cached_result is not None:
509
+ logger.debug(f"Resolved function '{function_name}' -> '{cached_result}' from cache")
510
+ return cached_result
511
+
512
+ # Also try without namespace qualifier (e.g., "kafka_audit_logging_topic")
513
+ if '::' in function_name:
514
+ simple_name = function_name.split('::')[-1]
515
+ cached_result = _constexpr_cache.lookup_function(simple_name)
516
+ if cached_result is not None:
517
+ logger.debug(f"Resolved function '{function_name}' (as '{simple_name}') -> '{cached_result}' from cache")
518
+ return cached_result
519
+
520
+ logger.debug(f"Function '{function_name}' not found in cache")
336
521
  return None
337
522
 
338
523
 
339
524
  def resolve_constexpr_identifier(identifier):
340
525
  """
341
526
  Resolve a constexpr identifier from Redpanda source code to its literal string value.
342
-
527
+
528
+ Uses a cache to avoid repeated filesystem walks for better performance.
343
529
  Searches common Redpanda source locations for constexpr string or string_view definitions matching the given identifier and returns the literal if found.
344
-
530
+
345
531
  Parameters:
346
- identifier (str): The identifier name to resolve (e.g., "scram").
347
-
532
+ identifier (str): The identifier name to resolve (e.g., "scram" or "net::tls_v1_2_cipher_suites").
533
+
348
534
  Returns:
349
535
  str or None: The resolved literal string value if found, otherwise `None`.
350
536
  """
351
- # Try to find the Redpanda source directory
352
- redpanda_source_paths = [
353
- 'tmp/redpanda', # Current directory
354
- '../tmp/redpanda', # Parent directory
355
- 'tools/property-extractor/tmp/redpanda', # From project root
356
- os.path.join(os.getcwd(), 'tools', 'property-extractor', 'tmp', 'redpanda')
357
- ]
358
-
359
- redpanda_source = None
360
- for path in redpanda_source_paths:
361
- if os.path.exists(path):
362
- redpanda_source = path
363
- break
364
-
537
+ # Try cache lookup first (much faster)
538
+ cached_result = _constexpr_cache.lookup_constexpr(identifier)
539
+ if cached_result is not None:
540
+ logger.debug(f"Resolved identifier '{identifier}' -> '{cached_result}' from cache")
541
+ return cached_result
542
+
543
+ # Fallback to original filesystem search for compatibility
544
+ redpanda_source = find_redpanda_source()
365
545
  if not redpanda_source:
366
546
  logger.debug(f"Could not find Redpanda source directory to resolve identifier: {identifier}")
367
547
  return None
548
+
549
+ # Strip namespace qualifier if present (e.g., "net::tls_v1_2_cipher_suites" -> "tls_v1_2_cipher_suites")
550
+ search_identifier = identifier.split('::')[-1] if '::' in identifier else identifier
368
551
 
369
552
  # Pattern to match constexpr string_view definitions
370
553
  # Matches: inline constexpr std::string_view scram{"SCRAM"};
371
554
  patterns = [
372
- rf'inline\s+constexpr\s+std::string_view\s+{re.escape(identifier)}\s*\{{\s*"([^"]+)"\s*\}}',
373
- rf'constexpr\s+std::string_view\s+{re.escape(identifier)}\s*\{{\s*"([^"]+)"\s*\}}',
374
- rf'inline\s+constexpr\s+auto\s+{re.escape(identifier)}\s*=\s*"([^"]+)"',
375
- rf'constexpr\s+auto\s+{re.escape(identifier)}\s*=\s*"([^"]+)"',
376
- rf'static\s+constexpr\s+std::string_view\s+{re.escape(identifier)}\s*\{{\s*"([^"]+)"\s*\}}',
377
- rf'static\s+inline\s+constexpr\s+std::string_view\s+{re.escape(identifier)}\s*\{{\s*"([^"]+)"\s*\}}',
555
+ rf'inline\s+constexpr\s+std::string_view\s+{re.escape(search_identifier)}\s*\{{\s*"([^"]+)"\s*\}}',
556
+ rf'constexpr\s+std::string_view\s+{re.escape(search_identifier)}\s*\{{\s*"([^"]+)"\s*\}}',
557
+ rf'inline\s+constexpr\s+auto\s+{re.escape(search_identifier)}\s*=\s*"([^"]+)"',
558
+ rf'constexpr\s+auto\s+{re.escape(search_identifier)}\s*=\s*"([^"]+)"',
559
+ rf'static\s+constexpr\s+std::string_view\s+{re.escape(search_identifier)}\s*\{{\s*"([^"]+)"\s*\}}',
560
+ rf'static\s+inline\s+constexpr\s+std::string_view\s+{re.escape(search_identifier)}\s*\{{\s*"([^"]+)"\s*\}}',
378
561
  ]
562
+
563
+ # Pattern for multi-line concatenated string constants (like TLS cipher suites)
564
+ # Matches: const std::string_view identifier = "line1"\n "line2"\n...;
565
+ multiline_pattern = rf'(?:const|extern\s+const)\s+std::string_view\s+{re.escape(search_identifier)}\s*=\s*((?:"[^"]*"\s*)+);'
379
566
 
380
567
  # Search recursively through the config directory and other common locations
381
568
  search_dirs = [
@@ -383,6 +570,7 @@ def resolve_constexpr_identifier(identifier):
383
570
  os.path.join(redpanda_source, 'src', 'v', 'kafka'),
384
571
  os.path.join(redpanda_source, 'src', 'v', 'security'),
385
572
  os.path.join(redpanda_source, 'src', 'v', 'pandaproxy'),
573
+ os.path.join(redpanda_source, 'src', 'v', 'net'), # For TLS cipher suites and network constants
386
574
  ]
387
575
 
388
576
  for search_dir in search_dirs:
@@ -399,13 +587,24 @@ def resolve_constexpr_identifier(identifier):
399
587
  with open(file_path, 'r', encoding='utf-8') as f:
400
588
  content = f.read()
401
589
 
402
- # Try each pattern
590
+ # Try each single-line pattern first
403
591
  for pattern in patterns:
404
592
  match = re.search(pattern, content, re.MULTILINE)
405
593
  if match:
406
594
  resolved_value = match.group(1)
407
595
  logger.debug(f"Resolved identifier '{identifier}' -> '{resolved_value}' from {file_path}")
408
596
  return resolved_value
597
+
598
+ # Try multi-line concatenated string pattern (for TLS cipher suites, etc.)
599
+ multiline_match = re.search(multiline_pattern, content, re.MULTILINE | re.DOTALL)
600
+ if multiline_match:
601
+ # Extract all quoted strings and concatenate them
602
+ strings_section = multiline_match.group(1)
603
+ string_literals = re.findall(r'"([^"]*)"', strings_section)
604
+ if string_literals:
605
+ resolved_value = ''.join(string_literals)
606
+ logger.debug(f"Resolved multi-line identifier '{identifier}' -> '{resolved_value[:50]}...' from {file_path}")
607
+ return resolved_value
409
608
 
410
609
  except (FileNotFoundError, PermissionError, OSError, UnicodeDecodeError) as e:
411
610
  logger.debug(f"Error reading {file_path}: {e}")
@@ -415,82 +614,6 @@ def resolve_constexpr_identifier(identifier):
415
614
  return None
416
615
 
417
616
 
418
- def get_enterprise_sasl_mechanisms():
419
- """
420
- Locate and resolve enterprise SASL mechanisms declared in Redpanda's sasl_mechanisms.h.
421
-
422
- Searches known Redpanda source locations for an inline constexpr definition of enterprise_sasl_mechanisms,
423
- extracts the identifiers, and resolves each identifier to its literal string value where possible; unresolved
424
- identifiers are converted to an uppercase fallback.
425
-
426
- Returns:
427
- list or None: List of enterprise SASL mechanism strings (e.g., ["GSSAPI", "OAUTHBEARER"]),
428
- or `None` if the lookup fails.
429
- """
430
- # Try to find the Redpanda source directory
431
- redpanda_source_paths = [
432
- 'tmp/redpanda', # Current directory
433
- '../tmp/redpanda', # Parent directory
434
- 'tools/property-extractor/tmp/redpanda', # From project root
435
- os.path.join(os.getcwd(), 'tools', 'property-extractor', 'tmp', 'redpanda')
436
- ]
437
-
438
- redpanda_source = None
439
- for path in redpanda_source_paths:
440
- if os.path.exists(path):
441
- redpanda_source = path
442
- break
443
-
444
- if not redpanda_source:
445
- logger.debug("Could not find Redpanda source directory to resolve enterprise SASL mechanisms")
446
- return None
447
-
448
- # Look for the enterprise_sasl_mechanisms definition in sasl_mechanisms.h
449
- sasl_mechanisms_file = os.path.join(redpanda_source, 'src', 'v', 'config', 'sasl_mechanisms.h')
450
-
451
- if not os.path.exists(sasl_mechanisms_file):
452
- logger.debug(f"sasl_mechanisms.h not found at {sasl_mechanisms_file}")
453
- return None
454
-
455
- try:
456
- with open(sasl_mechanisms_file, 'r', encoding='utf-8') as f:
457
- content = f.read()
458
-
459
- # Pattern to match the enterprise_sasl_mechanisms array definition
460
- # inline constexpr auto enterprise_sasl_mechanisms = std::to_array<std::string_view>({gssapi, oauthbearer});
461
- pattern = r'inline\s+constexpr\s+auto\s+enterprise_sasl_mechanisms\s*=\s*std::to_array<[^>]+>\s*\(\s*\{\s*([^}]+)\s*\}\s*\)'
462
-
463
- match = re.search(pattern, content, re.MULTILINE | re.DOTALL)
464
- if match:
465
- # Extract the identifiers from the array (e.g., "gssapi, oauthbearer")
466
- identifiers_str = match.group(1).strip()
467
-
468
- # Split by comma and clean up whitespace
469
- identifiers = [id.strip() for id in identifiers_str.split(',') if id.strip()]
470
-
471
- # Resolve each identifier to its actual string value
472
- mechanisms = []
473
- for identifier in identifiers:
474
- resolved_value = resolve_constexpr_identifier(identifier)
475
- if resolved_value:
476
- mechanisms.append(resolved_value)
477
- else:
478
- logger.debug(f"Could not resolve SASL mechanism identifier: {identifier}")
479
- # Fallback: use the identifier name in uppercase
480
- mechanisms.append(identifier.upper())
481
-
482
- if mechanisms:
483
- logger.debug(f"Resolved enterprise SASL mechanisms: {mechanisms}")
484
- return mechanisms
485
- else:
486
- logger.debug("Could not find enterprise_sasl_mechanisms definition in sasl_mechanisms.h")
487
- return None
488
-
489
- except (OSError, UnicodeDecodeError, re.error) as e:
490
- logger.debug(f"Error reading {sasl_mechanisms_file}: {e}")
491
- return None
492
-
493
-
494
617
  def validate_paths(options):
495
618
  """
496
619
  Validate that required file-system paths referenced by `options` exist and exit the process on failure.
@@ -520,7 +643,7 @@ def validate_paths(options):
520
643
  def get_file_pairs(options):
521
644
  path = Path(options.path)
522
645
 
523
- file_iter = path.rglob("*.h") if options.recursive else path.rglob("*.h")
646
+ file_iter = path.rglob("*.h") if options.recursive else path.glob("*.h")
524
647
 
525
648
  file_pairs = []
526
649
 
@@ -573,8 +696,19 @@ def get_files_with_properties(file_pairs, treesitter_parser, cpp_language):
573
696
 
574
697
  def transform_files_with_properties(files_with_properties):
575
698
  type_transformer = TypeTransformer()
699
+
700
+ # Initialize ConstantResolver for validator enum extraction
701
+ redpanda_src = find_redpanda_source()
702
+ constant_resolver = None
703
+ if redpanda_src:
704
+ src_v_path = Path(redpanda_src) / 'src' / 'v'
705
+ if src_v_path.exists():
706
+ constant_resolver = ConstantResolver(src_v_path)
707
+ logger.debug(f"Initialized ConstantResolver with path: {src_v_path}")
708
+
576
709
  transformers = [
577
710
  EnterpriseTransformer(), ## this must be the first, as it modifies current data
711
+ ParamNormalizerTransformer(),
578
712
  TypeTransformer(),
579
713
  MetaParamTransformer(),
580
714
  BasicInfoTransformer(),
@@ -585,6 +719,7 @@ def transform_files_with_properties(files_with_properties):
585
719
  VisibilityTransformer(),
586
720
  DeprecatedTransformer(),
587
721
  IsSecretTransformer(),
722
+ ExampleTransformer(),
588
723
  NumericBoundsTransformer(type_transformer),
589
724
  DurationBoundsTransformer(type_transformer),
590
725
  SimpleDefaultValuesTransformer(),
@@ -593,14 +728,20 @@ def transform_files_with_properties(files_with_properties):
593
728
  AliasTransformer(),
594
729
  ]
595
730
 
731
+ # Add enum extractors if we have a constant_resolver
732
+ if constant_resolver:
733
+ transformers.append(ValidatorEnumExtractor(constant_resolver))
734
+ transformers.append(RuntimeValidationEnumExtractor(constant_resolver))
735
+
596
736
  all_properties = PropertyBag()
597
737
 
598
738
  for fp, properties in files_with_properties:
599
739
  for name in properties:
600
740
  # ignore private properties
601
- if re.match(r"^_", name):
741
+ if UNDERSCORE_PREFIX_PATTERN.match(name):
602
742
  continue
603
743
 
744
+
604
745
  property_definition = PropertyBag()
605
746
 
606
747
  for transformer in transformers:
@@ -613,6 +754,143 @@ def transform_files_with_properties(files_with_properties):
613
754
  return all_properties
614
755
 
615
756
 
757
def apply_transformers_to_topic_properties(topic_properties):
    """
    Apply a subset of property transformers to separately-extracted topic
    properties so they carry the same metadata as cluster properties.

    Args:
        topic_properties: Mapping of topic property name -> extracted data.
            Falsy input (None, empty mapping) is returned unchanged.

    Returns:
        PropertyBag mapping each topic property name to its transformed
        definition, or the original input when it is empty/None.
    """
    if not topic_properties:
        return topic_properties

    # Only transformers that are relevant for topic-scoped properties.
    # NOTE(review): EnterpriseTransformer runs first in
    # transform_files_with_properties ("this must be the first, as it
    # modifies current data"); here it runs last - confirm ordering does
    # not matter for the fields it sets.
    transformers = [
        NeedsRestartTransformer(),  # Populates the needs_restart field
        VisibilityTransformer(),
        DeprecatedTransformer(),
        IsSecretTransformer(),
        ExperimentalTransformer(),
        EnterpriseTransformer(),  # Sets the is_enterprise field
    ]

    transformed_properties = PropertyBag()

    for prop_name, prop_data in topic_properties.items():
        # Start from the already-extracted data and enrich it in place.
        property_definition = PropertyBag(prop_data)

        # Topic properties have no real source file; use a sentinel path
        # so transformers that inspect the file path still work.
        mock_fp = "topic_properties"

        for transformer in transformers:
            if transformer.accepts(property_definition, mock_fp):
                transformer.parse(property_definition, property_definition, mock_fp)

        transformed_properties[prop_name] = property_definition

    # Use the module logger (not the root logger) like the rest of this file.
    logger.info(f"Applied transformers to {len(transformed_properties)} topic properties")
    return transformed_properties
795
+
796
+
797
def filter_referenced_definitions(properties, definitions):
    """
    Keep only the type definitions that properties actually reference.

    Computes the transitive closure of references: if a property points at
    type A and A's definition points at type B, both A and B survive. This
    significantly reduces the size of the definitions section.

    Args:
        properties: Dict of property definitions to scan for references.
        definitions: Dict of all known type definitions.

    Returns:
        dict: Subset of `definitions` containing only referenced types.
    """
    referenced = set()
    # Explicit worklist instead of recursion; `referenced` also acts as the
    # visited-set so each definition body is expanded at most once.
    pending = [properties]

    while pending:
        node = pending.pop()

        if isinstance(node, dict):
            # JSON-pointer style reference: "$ref": "#/definitions/<name>".
            if '$ref' in node:
                ref = node['$ref']
                if ref.startswith('#/definitions/'):
                    target = ref[len('#/definitions/'):]
                    if target not in referenced:
                        referenced.add(target)
                        if target in definitions:
                            pending.append(definitions[target])

            # Direct C++ type name that happens to match a definition.
            c_type = node.get('c_type')
            if c_type and c_type in definitions and c_type not in referenced:
                referenced.add(c_type)
                pending.append(definitions[c_type])

            # Walk every nested value as well.
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)

    filtered = {name: body for name, body in definitions.items() if name in referenced}

    logger.info(f"📉 Filtered definitions from {len(definitions)} to {len(filtered)} (only referenced types)")

    return filtered
856
+
857
+
858
def clean_private_fields_from_definitions(definitions):
    """
    Strip private (underscore-prefixed) fields from definition properties.

    Keeps the JSON output limited to the public API surface. Definitions
    without a 'properties' map (for example enums) pass through untouched,
    while definitions whose fields are all private are dropped entirely.

    Args:
        definitions: Dictionary of type definitions.

    Returns:
        Dictionary with private fields filtered out.
    """
    result = {}
    removed_count = 0

    for name, body in definitions.items():
        if 'properties' in body and body['properties']:
            all_fields = body['properties']
            public_fields = {
                field: schema
                for field, schema in all_fields.items()
                if not field.startswith('_')
            }
            removed_count += len(all_fields) - len(public_fields)

            # A definition survives only if at least one public field remains.
            if public_fields:
                result[name] = {**body, 'properties': public_fields}
        else:
            # No property map at all (e.g. enums): keep verbatim.
            result[name] = body

    if removed_count > 0:
        logger.info(f"🧹 Cleaned {removed_count} private fields from definitions")

    return result
892
+
893
+
616
894
  # The definitions.json file contains type definitions that the extractor uses to standardize and centralize type information. After extracting and transforming the properties from the source code, the function merge_properties_and_definitions looks up each property's type in the definitions. If a property's type (or the type of its items, in the case of arrays) matches one of the definitions, the transformer replaces that type with a JSON pointer ( such as #/definitions/<type>) to the corresponding entry in definitions.json. The final JSON output then includes both a properties section (with types now referencing the definitions) and a definitions section, so that consumers of the output can easily resolve the full type information.
617
895
  def merge_properties_and_definitions(properties, definitions):
618
896
  # Do not overwrite the resolved type/default with a reference. Just return the resolved properties and definitions.
@@ -621,67 +899,45 @@ def merge_properties_and_definitions(properties, definitions):
621
899
 
622
900
def apply_property_overrides(properties, overrides, overrides_file_path=None):
    """
    Apply overrides from an overrides mapping to the extracted properties,
    mutating and returning the properties dictionary.

    For each key under overrides["properties"]:
      - If the key matches a property dictionary key directly, the override
        is applied to that property.
      - Otherwise, existing properties are searched for an entry whose
        "name" field equals the override key, and the override is applied
        to it (handles cases where key != name).
      - If neither lookup succeeds, a brand-new property is created from
        the override and stored under the override key.

    Overrides may add or replace description, version, example content,
    default, type, config_scope, related_topics and other metadata. When
    examples reference external files, relative paths are resolved against
    overrides_file_path.

    Parameters:
        properties (dict): Existing property entries (modified in-place).
        overrides (dict): Loaded overrides structure; only the "properties"
            key is processed.
        overrides_file_path (str|None): Path of the overrides file, used to
            resolve relative example_file references.

    Returns:
        dict: The same properties mapping, with overrides applied and any
        new properties created.
    """
    if overrides and "properties" in overrides:
        for prop, override in overrides["properties"].items():
            # Direct key match: apply in place.
            if prop in properties:
                _apply_override_to_existing_property(properties[prop], override, overrides_file_path)
                continue

            # Fall back to matching on the "name" field; the first entry in
            # iteration order wins, mirroring a linear scan.
            matched_key = next(
                (
                    key
                    for key, data in properties.items()
                    if hasattr(data, 'get') and data.get('name') == prop
                ),
                None,
            )

            if matched_key:
                logger.info(f"Applying override to existing property '{prop}' (found by name, key='{matched_key}')")
                _apply_override_to_existing_property(properties[matched_key], override, overrides_file_path)
            else:
                # No match anywhere: synthesize a new property entry.
                logger.info(f"Creating new property from override: {prop}")
                properties[prop] = _create_property_from_override(prop, override, overrides_file_path)
    return properties
686
942
 
687
943
 
@@ -911,25 +1167,426 @@ def add_config_scope(properties):
911
1167
  return properties
912
1168
 
913
1169
 
1170
def map_enum_defaults(properties):
    """
    Replace raw enum defaults with their user-facing string equivalents.

    Runs after resolve_type_and_default() once enum constraints have been
    populated. For any enum-constrained property whose default is a string
    that is not in its enum list, the cached type definitions are searched
    for an enum_string_mappings entry covering that raw value; the first
    matching mapping rewrites the default in place.

    Args:
        properties (dict): Properties with resolved types and enum constraints.

    Returns:
        dict: The same mapping with mapped enum default values.
    """
    global _type_definitions_cache

    # Without cached type definitions there is nothing to map against.
    if not _type_definitions_cache:
        return properties

    for prop_name, prop in properties.items():
        # Only enum properties that actually carry a default are candidates.
        if not prop.get("enum") or "default" not in prop:
            continue

        default = prop.get("default")
        allowed = prop.get("enum", [])

        # Absent, already-valid, or non-string defaults need no mapping.
        if default is None or default in allowed or not isinstance(default, str):
            continue

        # First enum definition providing a mapping for this raw value wins.
        for type_def in _type_definitions_cache.values():
            if type_def.get("type") != "enum":
                continue
            mappings = type_def.get("enum_string_mappings")
            if not mappings or default not in mappings:
                continue
            mapped = mappings[default]
            prop["default"] = mapped
            logger.debug(f"✓ Mapped enum default for {prop_name}: {default} → {mapped}")
            break

    return properties
1223
+
1224
+
1225
+ def format_time_human_readable(value, unit):
1226
+ """
1227
+ Convert a numeric time value to a human-readable string.
1228
+
1229
+ Args:
1230
+ value: Numeric value (int)
1231
+ unit: 'ms' for milliseconds, 's' for seconds
1232
+
1233
+ Returns:
1234
+ Human-readable string like "7 days", "1 hour", "30 minutes"
1235
+ """
1236
+ # Convert to milliseconds for uniform handling
1237
+ if unit == 's':
1238
+ ms = value * 1000
1239
+ else:
1240
+ ms = value
1241
+
1242
+ # Time unit thresholds in milliseconds
1243
+ units = [
1244
+ (365 * 24 * 60 * 60 * 1000, 'year', 'years'),
1245
+ (7 * 24 * 60 * 60 * 1000, 'week', 'weeks'),
1246
+ (24 * 60 * 60 * 1000, 'day', 'days'),
1247
+ (60 * 60 * 1000, 'hour', 'hours'),
1248
+ (60 * 1000, 'minute', 'minutes'),
1249
+ (1000, 'second', 'seconds'),
1250
+ (1, 'millisecond', 'milliseconds'),
1251
+ ]
1252
+
1253
+ # Try to find the largest unit that divides evenly
1254
+ for threshold, singular, plural in units:
1255
+ if ms >= threshold and ms % threshold == 0:
1256
+ count = ms // threshold
1257
+ unit_name = singular if count == 1 else plural
1258
+ return f"{int(count)} {unit_name}"
1259
+
1260
+ # If no clean division, return the original with units
1261
+ if unit == 's':
1262
+ return f"{value} seconds"
1263
+ else:
1264
+ return f"{value} milliseconds"
1265
+
1266
+
1267
def evaluate_chrono_expressions(properties):
    """
    Evaluate chrono expressions in default values and convert to numeric values.
    Also adds human-readable versions for better UX in templates.

    Examples:
        - "24h * 365" -> 31536000000 (for milliseconds) + "365 days"
        - "7 * 24h" -> 604800 (for seconds) + "7 days"
        - "1h" -> 3600000 (for milliseconds) or 3600 (for seconds) + "1 hour"

    Conversion factors:
        - ms (milliseconds): 1
        - s (seconds): 1000 ms
        - min (minutes): 60000 ms
        - h (hours): 3600000 ms
        - d (days): 86400000 ms

    Parameters:
        properties (dict): Property name -> metadata dict. Entries whose
            string "default" parses as a time expression are mutated
            in-place: "default" becomes numeric and a
            "default_human_readable" key is added.

    Returns:
        dict: The same `properties` mapping after in-place conversion.
    """
    import re

    # Conversion factors to milliseconds
    time_units = {
        'ms': 1,
        's': 1000,
        'min': 60 * 1000,
        'h': 60 * 60 * 1000,
        'd': 24 * 60 * 60 * 1000,
    }

    def parse_time_value(expr):
        """Parse a time token like '24h' or '365' and return milliseconds (or None)."""
        expr = expr.strip()
        match = re.match(r'^(\d+(?:\.\d+)?)(ms|s|min|h|d)?$', expr)
        if match:
            value = float(match.group(1))
            unit = match.group(2)
            if unit:
                return value * time_units[unit]
            # Bare number - assume it's already in target units
            return value
        return None

    def evaluate_expression(expr_str):
        """Evaluate a simple arithmetic expression with time units; None on failure."""
        expr_str = expr_str.strip()

        # Simple case: a single time value with no arithmetic.
        simple_value = parse_time_value(expr_str)
        if simple_value is not None:
            return simple_value

        # Expressions like "24h * 365" or "7 * 24h": replace each time token
        # with its millisecond equivalent and keep operators as-is.
        tokens = re.split(r'(\s*[*/+\-]\s*)', expr_str)
        evaluated_tokens = []
        for token in tokens:
            token = token.strip()
            if not token:
                continue
            if token in ['*', '/', '+', '-']:
                evaluated_tokens.append(token)
            else:
                value = parse_time_value(token)
                evaluated_tokens.append(str(value) if value is not None else token)

        # SECURITY: eval() runs on extractor-produced strings (not user
        # input), but it is restricted to an empty environment so a stray
        # identifier cannot reach builtins.
        try:
            return eval(' '.join(evaluated_tokens), {"__builtins__": {}}, {})
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; non-arithmetic tokens simply yield None.
            return None

    converted_count = 0

    for prop_name, prop in properties.items():
        default = prop.get('default')
        c_type = prop.get('c_type', '')

        # Only string defaults can hold an unevaluated chrono expression.
        if not isinstance(default, str):
            continue

        # Heuristic gate: chrono-typed property, or the default "looks like"
        # a time expression (unit suffix or arithmetic operator present).
        is_chrono = 'chrono' in c_type or 'duration' in c_type.lower()
        has_time_expr = any(unit in default for unit in ['ms', 's', 'min', 'h', 'd']) or any(op in default for op in ['*', '+', '-', '/'])

        if not (is_chrono or has_time_expr):
            continue

        result_ms = evaluate_expression(default)
        if result_ms is None:
            continue

        # Convert the millisecond result into the unit implied by the C++
        # type; anything unrecognized (including generic duration types)
        # defaults to milliseconds, matching prior behavior.
        if 'std::chrono::milliseconds' in c_type:
            result, unit = int(result_ms), 'ms'
        elif 'std::chrono::seconds' in c_type:
            result, unit = int(result_ms / 1000), 's'
        elif 'std::chrono::minutes' in c_type:
            result, unit = int(result_ms / 60000), 'min'
        elif 'std::chrono::hours' in c_type:
            result, unit = int(result_ms / 3600000), 'h'
        else:
            result, unit = int(result_ms), 'ms'

        prop['default'] = result

        # Add human-readable version for templates
        human_readable = format_time_human_readable(result, unit)
        prop['default_human_readable'] = human_readable

        converted_count += 1
        logger.debug(f"Evaluated chrono expression for {prop_name}: '{default}' -> {result} ({human_readable})")

    if converted_count > 0:
        logger.info(f"Evaluated {converted_count} chrono expressions in default values")

    return properties
1409
+
1410
+
1411
def resolve_type_with_namespace(type_name, definitions):
    """
    Look up a type definition by name, trying namespace-qualified fallbacks.

    The bare name is checked first; if absent, common Redpanda namespaces
    are prepended in a fixed order (config, model, security, net, kafka,
    pandaproxy) until a match is found.

    Args:
        type_name: Type name to resolve (may be unqualified).
        definitions: Dictionary of type definitions.

    Returns:
        The matching definition dict, or {} when no candidate is found.
    """
    namespaces = ('config', 'model', 'security', 'net', 'kafka', 'pandaproxy')
    candidates = [type_name] + [f"{ns}::{type_name}" for ns in namespaces]

    for candidate in candidates:
        if candidate in definitions:
            return definitions[candidate]

    return {}
1435
+
1436
+
914
1437
  def resolve_type_and_default(properties, definitions):
915
1438
  """
916
1439
  Normalize property types and expand C++-style default values into JSON-compatible Python structures.
917
-
918
- This function resolves type references in each property against the provided definitions (supports "$ref" and direct type names), normalizes property "type" to a JSON Schema primitive when possible, expands C++ constructor/initializer and common C++ literal patterns found in "default" values into Python primitives/objects/lists, ensures array-typed properties have array defaults (including handling one_or_many_property cases), updates array item type information when item types reference definitions, and converts any `enterprise_value` strings via process_enterprise_value.
919
-
1440
+
1441
+ ============================================================================
1442
+ TYPE RESOLUTION SYSTEM - How C++ Types Become JSON Schema Types
1443
+ ============================================================================
1444
+
1445
+ This function bridges C++ type system with JSON Schema by:
1446
+ 1. Resolving definition references ($ref pointers) to actual type structures
1447
+ 2. Expanding C++ constructors into JSON-compatible default values
1448
+ 3. Ensuring type consistency between properties and their defaults
1449
+ 4. Handling special array/optional type patterns from Redpanda source
1450
+
1451
+ TYPE RESOLUTION FLOW:
1452
+ ┌─────────────────────────────────────────────────────────────────────────
1453
+ │ C++ Source:
1454
+ │ property<model::broker_endpoint> admin(...,
1455
+ │ model::broker_endpoint(net::unresolved_address("127.0.0.1", 9644))
1456
+ │ )
1457
+
1458
+ │ ↓ TypeTransformer (transformers.py)
1459
+ │ type: "broker_endpoint" (extracted from template parameter)
1460
+ │ default: "model::broker_endpoint(net::unresolved_address(\"127.0.0.1\", 9644))"
1461
+
1462
+ │ ↓ Definition Lookup (definitions dict)
1463
+ │ definitions["broker_endpoint"] = {
1464
+ │ "type": "object",
1465
+ │ "properties": {"address": {"type": "string"}, "port": {"type": "integer"}}
1466
+ │ }
1467
+
1468
+ │ ↓ Constructor Expansion (this function)
1469
+ │ type: "object" (resolved from definition)
1470
+ │ default: {"address": "127.0.0.1", "port": 9644} (expanded constructor)
1471
+
1472
+ │ ↓ JSON Output:
1473
+ │ "admin": {
1474
+ │ "type": "object",
1475
+ │ "properties": {...},
1476
+ │ "default": {"address": "127.0.0.1", "port": 9644}
1477
+ │ }
1478
+ └─────────────────────────────────────────────────────────────────────────
1479
+
1480
+ DEFINITION SYSTEM - Reusable Type Structures:
1481
+ ┌─────────────────────────────────────────────────────────────────────────
1482
+ │ Purpose: Definitions centralize complex type information to avoid
1483
+ │ repeating structure across multiple properties.
1484
+
1485
+ │ Source: definitions.json contains hand-crafted JSON Schema definitions
1486
+ │ for Redpanda's C++ types (endpoints, durations, enums, etc.)
1487
+
1488
+ │ Usage in Properties:
1489
+ │ Before resolution: type: "broker_endpoint"
1490
+ │ After resolution: type: "object" + properties from definition
1491
+
1492
+ │ $ref Pointers:
1493
+ │ Some definitions use JSON Schema $ref to reference other definitions:
1494
+ │ {"$ref": "#/definitions/compression"} → resolve recursively
1495
+
1496
+ │ Definition Structure:
1497
+ │ {
1498
+ │ "compression": {
1499
+ │ "type": "string",
1500
+ │ "enum": ["gzip", "snappy", "lz4", "zstd", "none"]
1501
+ │ },
1502
+ │ "broker_endpoint": {
1503
+ │ "type": "object",
1504
+ │ "properties": {
1505
+ │ "address": {"type": "string"},
1506
+ │ "port": {"type": "integer"}
1507
+ │ }
1508
+ │ }
1509
+ │ }
1510
+ └─────────────────────────────────────────────────────────────────────────
1511
+
1512
+ CONSTRUCTOR EXPANSION - C++ to JSON Conversion:
1513
+ ┌─────────────────────────────────────────────────────────────────────────
1514
+ │ SIMPLE PRIMITIVES:
1515
+ │ C++: 9092 → JSON: 9092
1516
+ │ C++: "localhost" → JSON: "localhost"
1517
+ │ C++: true → JSON: true
1518
+
1519
+ │ ENUM VALUES:
1520
+ │ C++: model::compression::gzip → JSON: "gzip"
1521
+ │ Pattern: namespace::type::value → Extract final value
1522
+
1523
+ │ CONSTRUCTORS:
1524
+ │ C++: net::unresolved_address("127.0.0.1", 9644)
1525
+ │ → Parse: type=unresolved_address, args=["127.0.0.1", 9644]
1526
+ │ → Lookup definition for "unresolved_address"
1527
+ │ → Match args to definition properties by position
1528
+ │ → Result: {"address": "127.0.0.1", "port": 9644}
1529
+
1530
+ │ ARRAYS:
1531
+ │ C++: std::vector<int>{1, 2, 3} → JSON: [1, 2, 3]
1532
+ │ C++: {1, 2, 3} → JSON: [1, 2, 3]
1533
+ │ Special: one_or_many_property wraps single values in arrays
1534
+
1535
+ │ CHRONO DURATIONS:
1536
+ │ C++: std::chrono::seconds{30} → JSON: 30 (with units in description)
1537
+ │ C++: std::chrono::milliseconds{5000} → JSON: 5000
1538
+
1539
+ │ OPTIONAL TYPES:
1540
+ │ C++: std::optional<int>{} → JSON: null
1541
+ │ C++: std::optional<int>{42} → JSON: 42
1542
+ └─────────────────────────────────────────────────────────────────────────
1543
+
1544
+ SPECIAL HANDLING:
1545
+ ┌─────────────────────────────────────────────────────────────────────────
1546
+ │ one_or_many_property<T>:
1547
+ │ - Always treated as array type in JSON
1548
+ │ - Single default values are wrapped: {x:1} → [{x:1}]
1549
+ │ - Already-array defaults preserved: [{x:1}] → [{x:1}]
1550
+
1551
+ │ Array Items Type Resolution:
1552
+ │ - If items.type references a definition, resolve it:
1553
+ │ items.type: "endpoint_tls_config" → items: {...definition...}
1554
+ │ - Ensures array item validation has full type information
1555
+
1556
+ │ Enterprise Values:
1557
+ │ - enterprise_value strings expanded via process_enterprise_value()
1558
+ │ - Converts license restriction patterns to user-friendly strings
1559
+ └─────────────────────────────────────────────────────────────────────────
1560
+
1561
+ HOW TO ADD NEW TYPE DEFINITIONS:
1562
+ 1. Identify the C++ type that needs a definition (e.g., new_endpoint_type)
1563
+ 2. Analyze the C++ struct/class to determine JSON schema structure
1564
+ 3. Add entry to definitions.json with appropriate JSON Schema:
1565
+ {
1566
+ "new_endpoint_type": {
1567
+ "type": "object",
1568
+ "properties": {"field1": {"type": "string"}, "field2": {"type": "integer"}}
1569
+ }
1570
+ }
1571
+ 4. TypeTransformer will automatically extract the type name from C++
1572
+ 5. This function will look up the definition and expand constructors
1573
+ 6. Test with a property using the new type to verify expansion
1574
+
920
1575
  Parameters:
921
- properties (dict): Mapping of property names to metadata dictionaries. Relevant keys that may be modified include "type", "default", "items", and "enterprise_value".
922
- definitions (dict): Mapping of definition names to JSON Schema definition dictionaries used to resolve $ref targets and to infer shapes for expanding constructor-style defaults.
923
-
1576
+ properties (dict): Property name metadata dict with keys: "type", "default",
1577
+ "items", "enterprise_value" that will be modified in-place
1578
+ definitions (dict): Type name → JSON Schema definition used for lookups
1579
+ and constructor expansion
1580
+
924
1581
  Returns:
925
- dict: The same `properties` mapping after in-place normalization and expansion of types, defaults, item types, and enterprise values.
1582
+ dict: The same `properties` dict after in-place type normalization and
1583
+ default value expansion
926
1584
  """
927
1585
  import ast
928
1586
  import re
929
1587
 
930
1588
  def resolve_definition_type(defn):
931
1589
  """Recursively resolve $ref pointers to get the actual type definition."""
932
- # Recursively resolve $ref
933
1590
  while isinstance(defn, dict) and "$ref" in defn:
934
1591
  ref = defn["$ref"]
935
1592
  ref_name = ref.split("/")[-1]
@@ -954,18 +1611,15 @@ def resolve_type_and_default(properties, definitions):
954
1611
  original_s = s
955
1612
  if s.startswith("{") and s.endswith("}"):
956
1613
  s = s[1:-1].strip()
957
-
958
- # Try parentheses syntax first: type_name(args)
959
- match = re.match(r'([a-zA-Z0-9_:]+)\((.*)\)', s)
1614
+
1615
+ match = CONSTRUCTOR_PATTERN.match(s)
960
1616
  if match:
961
1617
  type_name, arg_str = match.groups()
962
1618
  else:
963
- # Try curly brace syntax: type_name{args}
964
- match = re.match(r'([a-zA-Z0-9_:]+)\{(.*)\}', s)
1619
+ match = BRACED_CONSTRUCTOR_PATTERN.match(s)
965
1620
  if match:
966
1621
  type_name, arg_str = match.groups()
967
1622
  else:
968
- # Primitive or enum
969
1623
  if s.startswith('"') and s.endswith('"'):
970
1624
  return None, [ast.literal_eval(s)]
971
1625
  try:
@@ -997,7 +1651,7 @@ def resolve_type_and_default(properties, definitions):
997
1651
  def process_cpp_patterns(arg_str):
998
1652
  """
999
1653
  Convert a C++-style expression string into a JSON-friendly literal representation.
1000
-
1654
+
1001
1655
  This function recognises common C++ patterns produced by the extractor and maps them to values suitable for JSON schema defaults and examples. Handled cases include:
1002
1656
  - std::nullopt -> null
1003
1657
  - zero-argument functions (e.g., model::kafka_audit_logging_topic()) resolved from source when possible
@@ -1005,30 +1659,27 @@ def resolve_type_and_default(properties, definitions):
1005
1659
  - constexpr identifiers and simple string constructors resolved to their literal strings when available
1006
1660
  - known default constructors and truncated type names mapped to sensible defaults (e.g., duration -> 0, path -> "")
1007
1661
  - simple heuristics for unknown constructors and concatenated expressions
1008
-
1662
+
1009
1663
  Returns:
1010
1664
  processed (str): A string representing the JSON-ready value (for example: '"value"', 'null', '0', or the original input when no mapping applied).
1011
1665
  """
1012
1666
  arg_str = arg_str.strip()
1013
1667
  # Remove C++ digit separators (apostrophes) that may appear in numeric literals
1014
- # Example: "30'000ms" -> "30000ms". Use conservative replace only between digits.
1015
- arg_str = re.sub(r"(?<=\d)'(?=\d)", '', arg_str)
1016
-
1017
- # Handle std::nullopt -> null
1018
- if arg_str == "std::nullopt":
1668
+ # Example: "30'000ms" -> "30000ms"
1669
+ arg_str = DIGIT_SEPARATOR_PATTERN.sub('', arg_str)
1670
+
1671
+ if arg_str == "std::nullopt" or arg_str == "nullopt":
1019
1672
  return "null"
1020
-
1021
- # Handle C++ function calls that return constant values
1022
- # Dynamically look up function return values from the source code
1023
- function_call_match = re.match(r'([a-zA-Z0-9_:]+)\(\)', arg_str)
1673
+
1674
+ # Dynamically resolve C++ function calls by looking up their return values in source
1675
+ function_call_match = FUNCTION_CALL_PATTERN.match(arg_str)
1024
1676
  if function_call_match:
1025
1677
  function_name = function_call_match.group(1)
1026
1678
  resolved_value = resolve_cpp_function_call(function_name)
1027
1679
  if resolved_value is not None:
1028
1680
  return f'"{resolved_value}"'
1029
1681
 
1030
- # Handle std::chrono literals like std::chrono::minutes{5} -> "5min"
1031
- chrono_match = re.match(r'std::chrono::([a-zA-Z]+)\s*\{\s*(\d+)\s*\}', arg_str)
1682
+ chrono_match = CHRONO_PATTERN.match(arg_str)
1032
1683
  if chrono_match:
1033
1684
  unit = chrono_match.group(1)
1034
1685
  value = chrono_match.group(2)
@@ -1043,64 +1694,130 @@ def resolve_type_and_default(properties, definitions):
1043
1694
  short = unit_map.get(unit.lower(), unit)
1044
1695
  return f'"{value} {short}"'
1045
1696
 
1046
- # Handle enum-like patterns (such as fips_mode_flag::disabled -> "disabled").
1047
- # Only treat bare 'X::Y' tokens as enums — do not match when the token
1048
- # is followed by constructor braces/parentheses (e.g. std::chrono::minutes{5}).
1049
- enum_match = re.match(r'[a-zA-Z0-9_:]+::([a-zA-Z0-9_]+)\s*$', arg_str)
1697
+ # Handle chrono literals with parentheses like chrono::milliseconds(5min) -> "5 minutes"
1698
+ chrono_paren_match = CHRONO_PAREN_PATTERN.match(arg_str)
1699
+ if chrono_paren_match:
1700
+ unit = chrono_paren_match.group(1)
1701
+ value = chrono_paren_match.group(2).strip()
1702
+
1703
+ inner_time_match = TIME_UNIT_PATTERN.match(value)
1704
+ if inner_time_match:
1705
+ num, suffix = inner_time_match.groups()
1706
+ inner_unit_map = {
1707
+ "min": "minute",
1708
+ "s": "second",
1709
+ "ms": "millisecond",
1710
+ "h": "hour",
1711
+ }
1712
+ base = inner_unit_map.get(suffix, suffix)
1713
+ if num != "1" and not base.endswith("s"):
1714
+ base = base + "s"
1715
+ return f'"{num} {base}"'
1716
+
1717
+ # Evaluate arithmetic in duration constructors (e.g., "60 * 5" -> "300 seconds")
1718
+ if "*" in value:
1719
+ try:
1720
+ result = safe_arithmetic_eval(value)
1721
+ unit_map = {
1722
+ 'hours': 'hour',
1723
+ 'minutes': 'minute',
1724
+ 'seconds': 'second',
1725
+ 'milliseconds': 'millisecond',
1726
+ 'microseconds': 'microsecond',
1727
+ 'nanoseconds': 'nanosecond'
1728
+ }
1729
+ base = unit_map.get(unit.lower(), unit)
1730
+ if result != 1 and not base.endswith("s"):
1731
+ base = base + "s"
1732
+ return f'"{result} {base}"'
1733
+ except (ValueError, Exception):
1734
+ pass
1735
+
1736
+ try:
1737
+ num = int(value)
1738
+ unit_map = {
1739
+ 'hours': 'hour',
1740
+ 'minutes': 'minute',
1741
+ 'seconds': 'second',
1742
+ 'milliseconds': 'millisecond',
1743
+ 'microseconds': 'microsecond',
1744
+ 'nanoseconds': 'nanosecond'
1745
+ }
1746
+ base = unit_map.get(unit.lower(), unit)
1747
+ if num != 1 and not base.endswith("s"):
1748
+ base = base + "s"
1749
+ return f'"{num} {base}"'
1750
+ except ValueError:
1751
+ return f'"{value} {unit}"'
1752
+
1753
+ address_match = ADDRESS_PATTERN.match(arg_str)
1754
+ if address_match:
1755
+ addr = address_match.group(1).strip().strip('"')
1756
+ port = address_match.group(2).strip()
1757
+ try:
1758
+ port_val = int(port)
1759
+ return f'"{addr}:{port_val}"'
1760
+ except ValueError:
1761
+ return f'"{addr}:{port}"'
1762
+
1763
+ keyval_match = KEYVAL_PATTERN.match(arg_str)
1764
+ if keyval_match:
1765
+ key = keyval_match.group(1)
1766
+ value = keyval_match.group(2)
1767
+ processed_value = process_cpp_patterns(value)
1768
+ if processed_value.startswith('"') and processed_value.endswith('"'):
1769
+ processed_value = processed_value[1:-1]
1770
+ return processed_value
1771
+
1772
+ # Extract enum value from qualified identifiers (fips_mode_flag::disabled -> "disabled")
1773
+ # ENUM_PATTERN uses anchors to avoid matching constructor syntax (config::type{})
1774
+ enum_match = ENUM_PATTERN.match(arg_str)
1050
1775
  if enum_match:
1051
1776
  enum_value = enum_match.group(1)
1052
1777
  return f'"{enum_value}"'
1053
-
1054
- # Handle constexpr identifier resolution (such as scram -> "SCRAM")
1055
- # Check if this is a simple identifier that might be a constexpr variable
1056
- if re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', arg_str):
1778
+
1779
+ # Resolve constexpr identifiers by looking up their values in source files
1780
+ if IDENTIFIER_PATTERN.match(arg_str):
1057
1781
  resolved_value = resolve_constexpr_identifier(arg_str)
1058
1782
  if resolved_value is not None:
1059
1783
  return f'"{resolved_value}"'
1060
-
1061
- # Handle string constructor patterns like ss::sstring{identifier}
1062
- sstring_match = re.match(r'ss::sstring\{([a-zA-Z_][a-zA-Z0-9_]*)\}', arg_str)
1784
+
1785
+ sstring_match = SSTRING_PATTERN.match(arg_str)
1063
1786
  if sstring_match:
1064
1787
  identifier = sstring_match.group(1)
1065
1788
  resolved_value = resolve_constexpr_identifier(identifier)
1066
1789
  if resolved_value is not None:
1067
1790
  return f'"{resolved_value}"'
1068
1791
  else:
1069
- # Fallback to the identifier itself
1070
1792
  return f'"{identifier}"'
1071
-
1072
- # Handle default constructors and their default values
1073
- # This handles cases where C++ default constructors are used but should map to specific values
1074
-
1075
- # Pattern 1: Full constructor syntax like config::leaders_preference{}
1793
+
1794
+ # Map C++ default constructors to their runtime values
1795
+ # These patterns are derived from analyzing the C++ source implementations
1076
1796
  constructor_patterns = {
1077
- r'config::leaders_preference\{\}': '"none"', # Based on C++ code analysis
1797
+ r'config::leaders_preference\{\}': '"none"', # type_t::none is default
1078
1798
  r'std::chrono::seconds\{0\}': '0',
1079
1799
  r'std::chrono::milliseconds\{0\}': '0',
1080
1800
  r'model::timeout_clock::duration\{\}': '0',
1081
1801
  r'config::data_directory_path\{\}': '""',
1082
- r'std::optional<[^>]+>\{\}': 'null', # Empty optional
1802
+ r'std::optional<[^>]+>\{\}': 'null',
1083
1803
  }
1084
-
1804
+
1085
1805
  for pattern, replacement in constructor_patterns.items():
1086
1806
  if re.match(pattern, arg_str):
1087
1807
  return replacement
1088
-
1089
- # Pattern 2: Truncated type names that likely came from default constructors
1090
- # These are cases where tree-sitter parsing truncated "config::type{}" to just "type"
1808
+
1809
+ # Fallback mappings for truncated type names (tree-sitter may truncate constructors)
1091
1810
  truncated_patterns = {
1092
- 'leaders_preference': '"none"', # config::leaders_preference{} -> none
1093
- 'data_directory_path': '""', # config::data_directory_path{} -> empty string
1094
- 'timeout_clock_duration': '0', # model::timeout_clock::duration{} -> 0
1095
- 'log_level': '"info"', # Default log level
1096
- 'compression_type': '"none"', # Default compression
1811
+ 'leaders_preference': '"none"',
1812
+ 'data_directory_path': '""',
1813
+ 'timeout_clock_duration': '0',
1814
+ 'log_level': '"info"',
1815
+ 'compression_type': '"none"',
1097
1816
  }
1098
-
1099
- # Check if arg_str is exactly one of these truncated patterns
1817
+
1100
1818
  if arg_str in truncated_patterns:
1101
1819
  return truncated_patterns[arg_str]
1102
-
1103
- # Pattern 3: Handle remaining default constructor syntax generically
1820
+
1104
1821
  generic_constructor_match = re.match(r'[a-zA-Z0-9_:]+\{\}', arg_str)
1105
1822
  if generic_constructor_match:
1106
1823
  # For unknown constructors, try to infer a reasonable default
@@ -1169,8 +1886,12 @@ def resolve_type_and_default(properties, definitions):
1169
1886
  else:
1170
1887
  return processed_arg
1171
1888
 
1172
- type_def = resolve_definition_type(definitions.get(type_name, {}))
1889
+ type_def = resolve_definition_type(resolve_type_with_namespace(type_name, definitions))
1173
1890
  if "enum" in type_def:
1891
+ # Strip C++ namespace qualifiers from enum values
1892
+ # e.g., model::partition_autobalancing_mode::continuous → continuous
1893
+ if isinstance(default_str, str) and '::' in default_str:
1894
+ return default_str.split('::')[-1]
1174
1895
  return default_str
1175
1896
  # If it has properties but no explicit type, it's an object
1176
1897
  if type_def.get("type") == "object" or (type_def.get("properties") and not type_def.get("type")):
@@ -1180,10 +1901,12 @@ def resolve_type_and_default(properties, definitions):
1180
1901
 
1181
1902
  props = list(type_def["properties"].keys())
1182
1903
  result = {}
1183
-
1904
+
1184
1905
  # For each constructor argument, try to expand it and map to the correct property
1185
1906
  for i, prop in enumerate(props):
1186
1907
  prop_def = type_def["properties"][prop]
1908
+ # Strip leading underscore from private field names for public API
1909
+ public_prop_name = prop.lstrip('_')
1187
1910
  if "$ref" in prop_def:
1188
1911
  sub_type = prop_def["$ref"].split("/")[-1]
1189
1912
  else:
@@ -1199,33 +1922,35 @@ def resolve_type_and_default(properties, definitions):
1199
1922
  # Get the definition for the nested type
1200
1923
  nested_type_def = resolve_definition_type(definitions.get(nested_tname, {}))
1201
1924
  nested_props = list(nested_type_def.get("properties", {}).keys())
1202
-
1925
+
1203
1926
  # Expand the nested constructor by mapping its arguments to its properties
1204
1927
  nested_result = {}
1205
1928
  for j, nested_prop in enumerate(nested_props):
1206
1929
  nested_prop_def = nested_type_def["properties"][nested_prop]
1930
+ # Strip leading underscore from private field names for public API
1931
+ public_nested_prop_name = nested_prop.lstrip('_')
1207
1932
  if j < len(nested_args):
1208
1933
  nested_arg = nested_args[j]
1209
1934
  # Apply simple C++ pattern processing to the argument
1210
1935
  processed_nested_arg = process_cpp_patterns(nested_arg)
1211
-
1936
+
1212
1937
  # Convert the processed argument based on the property type
1213
1938
  if nested_prop_def.get("type") == "string":
1214
1939
  if processed_nested_arg.startswith('"') and processed_nested_arg.endswith('"'):
1215
- nested_result[nested_prop] = ast.literal_eval(processed_nested_arg)
1940
+ nested_result[public_nested_prop_name] = ast.literal_eval(processed_nested_arg)
1216
1941
  else:
1217
- nested_result[nested_prop] = processed_nested_arg
1942
+ nested_result[public_nested_prop_name] = processed_nested_arg
1218
1943
  elif nested_prop_def.get("type") == "integer":
1219
1944
  try:
1220
- nested_result[nested_prop] = int(processed_nested_arg)
1945
+ nested_result[public_nested_prop_name] = int(processed_nested_arg)
1221
1946
  except ValueError:
1222
- nested_result[nested_prop] = processed_nested_arg
1947
+ nested_result[public_nested_prop_name] = processed_nested_arg
1223
1948
  elif nested_prop_def.get("type") == "boolean":
1224
- nested_result[nested_prop] = processed_nested_arg.lower() == "true"
1949
+ nested_result[public_nested_prop_name] = processed_nested_arg.lower() == "true"
1225
1950
  else:
1226
- nested_result[nested_prop] = processed_nested_arg
1951
+ nested_result[public_nested_prop_name] = processed_nested_arg
1227
1952
  else:
1228
- nested_result[nested_prop] = None
1953
+ nested_result[public_nested_prop_name] = None
1229
1954
 
1230
1955
  # Now we have the expanded nested object, we need to map it to the parent object's properties
1231
1956
  # This is where the type-aware mapping happens
@@ -1244,39 +1969,40 @@ def resolve_type_and_default(properties, definitions):
1244
1969
  result.update(nested_result)
1245
1970
  # Set remaining properties to None
1246
1971
  for remaining_prop in props[i+1:]:
1247
- if remaining_prop not in result:
1248
- result[remaining_prop] = None
1972
+ public_remaining_prop = remaining_prop.lstrip('_')
1973
+ if public_remaining_prop not in result:
1974
+ result[public_remaining_prop] = None
1249
1975
  break
1250
1976
  else:
1251
1977
  # Map the nested object to the current property
1252
- result[prop] = nested_result
1978
+ result[public_prop_name] = nested_result
1253
1979
  else:
1254
1980
  # Fallback: recursively expand with the expected property type
1255
1981
  expanded_arg = expand_default(sub_type, arg)
1256
- result[prop] = expanded_arg
1982
+ result[public_prop_name] = expanded_arg
1257
1983
  else:
1258
1984
  # Simple value, parse based on the property type
1259
1985
  # First apply C++ pattern processing
1260
1986
  processed_arg = process_cpp_patterns(arg)
1261
-
1987
+
1262
1988
  if sub_type == "string":
1263
1989
  # If processed_arg is already quoted, use ast.literal_eval, otherwise keep as is
1264
1990
  if processed_arg.startswith('"') and processed_arg.endswith('"'):
1265
- result[prop] = ast.literal_eval(processed_arg)
1991
+ result[public_prop_name] = ast.literal_eval(processed_arg)
1266
1992
  else:
1267
- result[prop] = processed_arg
1993
+ result[public_prop_name] = processed_arg
1268
1994
  elif sub_type == "integer":
1269
1995
  try:
1270
- result[prop] = int(processed_arg)
1996
+ result[public_prop_name] = int(processed_arg)
1271
1997
  except ValueError:
1272
1998
  # If conversion fails, keep as string (might be processed C++ pattern)
1273
- result[prop] = processed_arg
1999
+ result[public_prop_name] = processed_arg
1274
2000
  elif sub_type == "boolean":
1275
- result[prop] = processed_arg.lower() == "true"
2001
+ result[public_prop_name] = processed_arg.lower() == "true"
1276
2002
  else:
1277
- result[prop] = processed_arg
2003
+ result[public_prop_name] = processed_arg
1278
2004
  else:
1279
- result[prop] = None
2005
+ result[public_prop_name] = None
1280
2006
  return result
1281
2007
  elif type_def.get("type") == "array":
1282
2008
  # Handle array defaults with C++ initializer list syntax like {model::broker_endpoint(...)}
@@ -1361,24 +2087,45 @@ def resolve_type_and_default(properties, definitions):
1361
2087
  for prop in properties.values():
1362
2088
  t = prop.get("type")
1363
2089
  ref_name = None
1364
-
1365
- # Handle both JSON pointer references and direct type names
2090
+
2091
+
2092
+ # Handle both JSON pointer references and direct type names (including C++ types)
1366
2093
  if isinstance(t, str):
1367
2094
  if t.startswith("#/definitions/"):
1368
2095
  ref_name = t.split("/")[-1]
1369
- elif t in definitions:
1370
- ref_name = t
1371
-
1372
- if ref_name and ref_name in definitions:
1373
- defn = definitions.get(ref_name)
2096
+ else:
2097
+ # Try to resolve the type with namespace prefixes
2098
+ resolved_def = resolve_type_with_namespace(t, definitions)
2099
+ if resolved_def:
2100
+ # Find the actual key name that matched
2101
+ if t in definitions:
2102
+ ref_name = t
2103
+ else:
2104
+ # Try namespace-qualified versions
2105
+ for namespace in ['config', 'model', 'security', 'net', 'kafka', 'pandaproxy']:
2106
+ qualified = f"{namespace}::{t}"
2107
+ if qualified in definitions:
2108
+ ref_name = qualified
2109
+ break
2110
+
2111
+ if ref_name:
2112
+ defn = resolve_type_with_namespace(ref_name, definitions) if ref_name not in definitions else definitions.get(ref_name)
1374
2113
  if defn:
1375
2114
  resolved = resolve_definition_type(defn)
1376
2115
  # Always set type to the resolved type string (object, string, etc.)
1377
2116
  resolved_type = resolved.get("type")
1378
- if resolved_type in ("object", "string", "integer", "boolean", "array", "number"):
2117
+
2118
+ # Special handling for enum types
2119
+ if resolved_type == "enum" or "enum" in resolved:
2120
+ # Enums are represented as strings with an enum constraint in JSON Schema
2121
+ prop["type"] = "string"
2122
+ if "enum" in resolved:
2123
+ prop["enum"] = resolved["enum"]
2124
+ elif resolved_type in ("object", "string", "integer", "boolean", "array", "number"):
1379
2125
  prop["type"] = resolved_type
1380
2126
  else:
1381
2127
  prop["type"] = "object" # fallback for complex types
2128
+
1382
2129
  # Expand default if possible
1383
2130
  if "default" in prop and prop["default"] is not None:
1384
2131
  expanded = expand_default(ref_name, prop["default"])
@@ -1494,24 +2241,70 @@ def resolve_type_and_default(properties, definitions):
1494
2241
  # This handles cases like admin_api_tls: "{}" -> []
1495
2242
  prop["default"] = []
1496
2243
 
1497
- # Also handle array item types
1498
- if prop.get("type") == "array" and "items" in prop:
2244
+ # Also handle array item types - resolve C++ type references
2245
+ # Note: Check for 'items' field regardless of type, since some transformers may overwrite
2246
+ # the type from "array" to "object" while leaving the items field behind
2247
+ if "items" in prop:
1499
2248
  items_type = prop["items"].get("type")
1500
- if isinstance(items_type, str) and items_type in definitions:
1501
- item_defn = definitions.get(items_type)
1502
- if item_defn:
1503
- resolved_item = resolve_definition_type(item_defn)
1504
- resolved_item_type = resolved_item.get("type")
1505
- if resolved_item_type in ("object", "string", "integer", "boolean", "array", "number"):
1506
- prop["items"]["type"] = resolved_item_type
2249
+ if isinstance(items_type, str):
2250
+ # Check if items_type is a C++ type that needs resolution
2251
+ if items_type in definitions:
2252
+ item_defn = definitions.get(items_type)
2253
+ if item_defn:
2254
+ resolved_item = resolve_definition_type(item_defn)
2255
+ resolved_item_type = resolved_item.get("type")
2256
+ if resolved_item_type in ("object", "string", "integer", "boolean", "array", "number"):
2257
+ prop["items"]["type"] = resolved_item_type
2258
+ else:
2259
+ prop["items"]["type"] = "object" # fallback for complex types
2260
+ # If not in definitions but looks like a C++ type, apply fallback logic
2261
+ elif "::" in items_type or items_type.endswith(">") or items_type.endswith("_t") or items_type.startswith("std::"):
2262
+ # Apply same heuristics as for unresolved property types
2263
+ if any(word in items_type.lower() for word in ["int", "long", "short", "double", "float", "number", "_id"]):
2264
+ prop["items"]["type"] = "integer"
2265
+ elif any(word in items_type.lower() for word in ["bool"]):
2266
+ prop["items"]["type"] = "boolean"
2267
+ elif any(word in items_type.lower() for word in ["string", "str", "path", "url", "name"]):
2268
+ prop["items"]["type"] = "string"
1507
2269
  else:
1508
- prop["items"]["type"] = "object" # fallback for complex types
2270
+ # Default to object for complex types (config::*, model::*, etc.)
2271
+ prop["items"]["type"] = "object"
2272
+ logger.debug(f"Resolved C++ type in items: {items_type} -> {prop['items']['type']} (for property '{prop.get('name', 'unknown')}')")
1509
2273
 
1510
2274
  # Final pass: apply C++ pattern processing to any remaining unprocessed defaults
1511
2275
  for prop in properties.values():
1512
2276
  if "default" in prop:
1513
2277
  default_value = prop["default"]
1514
2278
 
2279
+ # Special handling for arrays containing key-value patterns like "'key': 'value'"
2280
+ if isinstance(default_value, list) and len(default_value) > 0:
2281
+ # Check if this looks like an array of key-value patterns
2282
+ all_keyval_patterns = True
2283
+ for item in default_value:
2284
+ if not isinstance(item, str) or not re.match(r"'[^']+'\s*:\s*'[^']+'", item):
2285
+ all_keyval_patterns = False
2286
+ break
2287
+
2288
+ if all_keyval_patterns:
2289
+ # Convert array of key-value strings to a single object
2290
+ result_object = {}
2291
+ for item in default_value:
2292
+ keyval_match = re.match(r"'([^']+)'\s*:\s*'([^']+)'", item)
2293
+ if keyval_match:
2294
+ key = keyval_match.group(1)
2295
+ value = keyval_match.group(2)
2296
+ # Process the value part
2297
+ processed_value = process_cpp_patterns(value)
2298
+ if processed_value.startswith('"') and processed_value.endswith('"'):
2299
+ processed_value = processed_value[1:-1] # Remove outer quotes
2300
+ result_object[key] = processed_value
2301
+
2302
+ # Convert the array type to object since we're now storing an object
2303
+ prop["default"] = result_object
2304
+ if prop.get("type") == "array":
2305
+ prop["type"] = "object"
2306
+ continue # Skip further processing for this property
2307
+
1515
2308
  if isinstance(default_value, str):
1516
2309
  # Process string defaults
1517
2310
  processed = process_cpp_patterns(default_value)
@@ -1544,27 +2337,29 @@ def resolve_type_and_default(properties, definitions):
1544
2337
  # Map constructor arguments to type properties
1545
2338
  for j, nested_prop in enumerate(nested_props):
1546
2339
  nested_prop_def = nested_type_def["properties"][nested_prop]
2340
+ # Strip leading underscore from private field names for public API
2341
+ public_nested_prop_name = nested_prop.lstrip('_')
1547
2342
  if j < len(args):
1548
2343
  nested_arg = args[j]
1549
2344
  processed_nested_arg = process_cpp_patterns(nested_arg)
1550
-
2345
+
1551
2346
  # Convert based on property type
1552
2347
  if nested_prop_def.get("type") == "string":
1553
2348
  if processed_nested_arg.startswith('"') and processed_nested_arg.endswith('"'):
1554
- nested_result[nested_prop] = ast.literal_eval(processed_nested_arg)
2349
+ nested_result[public_nested_prop_name] = ast.literal_eval(processed_nested_arg)
1555
2350
  else:
1556
- nested_result[nested_prop] = processed_nested_arg
2351
+ nested_result[public_nested_prop_name] = processed_nested_arg
1557
2352
  elif nested_prop_def.get("type") == "integer":
1558
2353
  try:
1559
- nested_result[nested_prop] = int(processed_nested_arg)
2354
+ nested_result[public_nested_prop_name] = int(processed_nested_arg)
1560
2355
  except ValueError:
1561
- nested_result[nested_prop] = processed_nested_arg
2356
+ nested_result[public_nested_prop_name] = processed_nested_arg
1562
2357
  elif nested_prop_def.get("type") == "boolean":
1563
- nested_result[nested_prop] = processed_nested_arg.lower() == "true"
2358
+ nested_result[public_nested_prop_name] = processed_nested_arg.lower() == "true"
1564
2359
  else:
1565
- nested_result[nested_prop] = processed_nested_arg
2360
+ nested_result[public_nested_prop_name] = processed_nested_arg
1566
2361
  else:
1567
- nested_result[nested_prop] = None
2362
+ nested_result[public_nested_prop_name] = None
1568
2363
 
1569
2364
  # For special case of net::unresolved_address inside broker_authn_endpoint
1570
2365
  if tname == "net::unresolved_address":
@@ -1655,7 +2450,93 @@ def resolve_type_and_default(properties, definitions):
1655
2450
  if isinstance(enterprise_value, str):
1656
2451
  processed_enterprise = process_enterprise_value(enterprise_value)
1657
2452
  prop["enterprise_value"] = processed_enterprise
1658
-
2453
+
2454
+ # FINAL COMPREHENSIVE PASS: Ensure NO C++ types remain in the output
2455
+ # This catches any edge cases that earlier passes missed
2456
+ for prop_name, prop in properties.items():
2457
+ # Check property type field
2458
+ if isinstance(prop.get("type"), str) and ("::" in prop["type"] or prop["type"].endswith(">")):
2459
+ logger.warning(f"Found unresolved C++ type in property '{prop_name}': {prop['type']}")
2460
+ # Apply smart fallback resolution
2461
+ cpp_type = prop["type"]
2462
+ if any(word in cpp_type.lower() for word in ["int", "long", "short", "double", "float", "number", "_id"]):
2463
+ prop["type"] = "integer"
2464
+ elif any(word in cpp_type.lower() for word in ["bool"]):
2465
+ prop["type"] = "boolean"
2466
+ elif any(word in cpp_type.lower() for word in ["string", "str", "path", "url", "name"]):
2467
+ prop["type"] = "string"
2468
+ else:
2469
+ # Default to object for complex types (config::*, model::*, etc.)
2470
+ prop["type"] = "object"
2471
+ logger.info(f" Resolved to: {prop['type']}")
2472
+
2473
+ # Check items.type field for arrays
2474
+ if prop.get("type") == "array" and "items" in prop:
2475
+ items_type = prop["items"].get("type")
2476
+ if isinstance(items_type, str) and ("::" in items_type or items_type.endswith(">")):
2477
+ logger.warning(f"Found unresolved C++ type in property '{prop_name}' items: {items_type}")
2478
+ # Apply smart fallback resolution
2479
+ if any(word in items_type.lower() for word in ["int", "long", "short", "double", "float", "number", "_id"]):
2480
+ prop["items"]["type"] = "integer"
2481
+ elif any(word in items_type.lower() for word in ["bool"]):
2482
+ prop["items"]["type"] = "boolean"
2483
+ elif any(word in items_type.lower() for word in ["string", "str", "path", "url", "name"]):
2484
+ prop["items"]["type"] = "string"
2485
+ else:
2486
+ # Default to object for complex types (config::*, model::*, etc.)
2487
+ prop["items"]["type"] = "object"
2488
+ logger.info(f" Resolved to: {prop['items']['type']}")
2489
+
2490
+ # Check items.$ref field
2491
+ # Only warn if it's NOT a JSON pointer (valid JSON pointers start with #/)
2492
+ items_ref = prop["items"].get("$ref")
2493
+ if isinstance(items_ref, str) and "::" in items_ref:
2494
+ if items_ref.startswith("#/definitions/"):
2495
+ # This is a valid JSON pointer - extract and resolve the definition name
2496
+ ref_type = items_ref.split("/")[-1]
2497
+ if ref_type in definitions:
2498
+ resolved = resolve_definition_type(definitions[ref_type])
2499
+ resolved_type = resolved.get("type", "object")
2500
+ prop["items"]["type"] = resolved_type
2501
+ del prop["items"]["$ref"]
2502
+ logger.debug(f"Resolved items.$ref '{items_ref}' to '{resolved_type}' for property '{prop_name}'")
2503
+ else:
2504
+ logger.warning(f"Cannot resolve items.$ref '{items_ref}' - definition not found for property '{prop_name}'")
2505
+ else:
2506
+ # Raw C++ type name (not a JSON pointer) - this is an error
2507
+ logger.warning(f"Found raw C++ type in property '{prop_name}' items.$ref: {items_ref}")
2508
+ if items_ref in definitions:
2509
+ resolved = resolve_definition_type(definitions[items_ref])
2510
+ resolved_type = resolved.get("type", "object")
2511
+ prop["items"]["type"] = resolved_type
2512
+ del prop["items"]["$ref"]
2513
+ logger.info(f" Resolved to: {prop['items']['type']}")
2514
+
2515
+ # Check $ref field at property level
2516
+ # Only warn if it's NOT a JSON pointer
2517
+ prop_ref = prop.get("$ref")
2518
+ if isinstance(prop_ref, str) and "::" in prop_ref:
2519
+ if prop_ref.startswith("#/definitions/"):
2520
+ # This is a valid JSON pointer - extract and resolve the definition name
2521
+ ref_type = prop_ref.split("/")[-1]
2522
+ if ref_type in definitions:
2523
+ resolved = resolve_definition_type(definitions[ref_type])
2524
+ resolved_type = resolved.get("type", "object")
2525
+ prop["type"] = resolved_type
2526
+ del prop["$ref"]
2527
+ logger.debug(f"Resolved $ref '{prop_ref}' to '{resolved_type}' for property '{prop_name}'")
2528
+ else:
2529
+ logger.warning(f"Cannot resolve $ref '{prop_ref}' - definition not found for property '{prop_name}'")
2530
+ else:
2531
+ # Raw C++ type name (not a JSON pointer) - this is an error
2532
+ logger.warning(f"Found raw C++ type in property '{prop_name}' $ref: {prop_ref}")
2533
+ if prop_ref in definitions:
2534
+ resolved = resolve_definition_type(definitions[prop_ref])
2535
+ resolved_type = resolved.get("type", "object")
2536
+ prop["type"] = resolved_type
2537
+ del prop["$ref"]
2538
+ logger.info(f" Resolved to: {prop['type']}")
2539
+
1659
2540
  return properties
1660
2541
 
1661
2542
 
@@ -1733,7 +2614,7 @@ def extract_topic_properties(source_path):
1733
2614
  "description": prop_data.get("description", ""),
1734
2615
  "type": prop_data.get("type", "string"),
1735
2616
  "config_scope": "topic",
1736
- "source_file": prop_data.get("source_file", ""),
2617
+ "defined_in": prop_data.get("defined_in", ""),
1737
2618
  "corresponding_cluster_property": prop_data.get("corresponding_cluster_property", ""),
1738
2619
  "acceptable_values": prop_data.get("acceptable_values", ""),
1739
2620
  "is_deprecated": False,
@@ -1754,7 +2635,8 @@ def main():
1754
2635
  CLI entry point that extracts Redpanda configuration properties from C++ sources and emits JSON outputs.
1755
2636
 
1756
2637
  Runs a full extraction and transformation pipeline:
1757
- - Parses command-line options (required: --path). Optional flags include --recursive, --output, --enhanced-output, --definitions, --overrides, --cloud-support, and --verbose.
2638
+ - Parses command-line options (required: --path). Optional flags include --recursive, --output, --enhanced-output, --overrides, --cloud-support, and --verbose.
2639
+ - The --overrides file can contain both property overrides (under "properties" key) and definition overrides (under "definitions" key).
1758
2640
  - Validates input paths and collects header/.cc file pairs.
1759
2641
  - Initializes Tree-sitter C++ parser and extracts configuration properties from source files (optionally augmented with topic properties).
1760
2642
  - Produces two outputs:
@@ -1767,7 +2649,9 @@ def main():
1767
2649
  Side effects:
1768
2650
  - Reads and writes files, may call external cloud config fetchers, logs to the configured logger, and may call sys.exit() on fatal conditions.
1769
2651
  """
2652
+ global _type_definitions_cache
1770
2653
  import argparse
2654
+ from pathlib import Path
1771
2655
 
1772
2656
  def generate_options():
1773
2657
  """
@@ -1778,8 +2662,12 @@ def main():
1778
2662
  - --recursive: scan the path recursively.
1779
2663
  - --output: file path to write the JSON output (stdout if omitted).
1780
2664
  - --enhanced-output: file path to write the enhanced JSON output with overrides applied.
1781
- - --definitions: JSON file containing type definitions (defaults to a definitions.json co-located with this module).
1782
- - --overrides: optional JSON file with property description/metadata overrides.
2665
+ - --overrides: optional JSON file with property and definition overrides. Structure:
2666
+ {
2667
+ "properties": { "property_name": { "description": "...", ... } },
2668
+ "definitions": { "type_name": { "type": "object", "properties": {...} } }
2669
+ }
2670
+ - --definitions: DEPRECATED - use overrides.json with "definitions" key instead.
1783
2671
  - --cloud-support: enable fetching cloud metadata from the cloudv2 repository (requires GITHUB_TOKEN and external dependencies such as pyyaml and requests).
1784
2672
  - -v / --verbose: enable verbose (DEBUG-level) logging.
1785
2673
 
@@ -1798,10 +2686,16 @@ def main():
1798
2686
  arg_parser.add_argument("--enhanced-output", type=str, help="Enhanced JSON output file path")
1799
2687
 
1800
2688
  # Data sources
1801
- arg_parser.add_argument("--definitions", type=str,
1802
- default=os.path.dirname(os.path.realpath(__file__)) + "/definitions.json",
1803
- help="Type definitions JSON file")
1804
- arg_parser.add_argument("--overrides", type=str, help="Property overrides JSON file")
2689
+ arg_parser.add_argument(
2690
+ "--definitions",
2691
+ type=str,
2692
+ help="DEPRECATED: Type definitions JSON file (use --overrides with 'definitions' key instead)"
2693
+ )
2694
+ arg_parser.add_argument(
2695
+ "--overrides",
2696
+ type=str,
2697
+ help="JSON file with property and definition overrides. Format: {'properties': {...}, 'definitions': {...}}"
2698
+ )
1805
2699
 
1806
2700
  # Feature flags (set by Makefile from environment variables)
1807
2701
  arg_parser.add_argument("--cloud-support", action="store_true", help="Enable cloud metadata")
@@ -1827,26 +2721,71 @@ def main():
1827
2721
  logging.error("No h/cc file pairs were found")
1828
2722
  sys.exit(-1)
1829
2723
 
1830
- definitions = None
2724
+ # DYNAMIC TYPE DEFINITION EXTRACTION
2725
+ # Automatically extract type definitions from C++ source code
2726
+ # This replaces the need for manually maintaining definitions.json
2727
+ logger.info("🔍 Extracting type definitions from C++ source code...")
1831
2728
 
1832
- if options.definitions:
1833
- try:
1834
- with open(options.definitions) as json_file:
1835
- definitions = json.load(json_file)
1836
- except json.JSONDecodeError as e:
1837
- logging.error(f"Failed to parse definitions file: {e}")
1838
- sys.exit(1)
2729
+ from type_definition_extractor import extract_definitions_from_source
2730
+
2731
+ try:
2732
+ # Extract definitions from the parent 'v' directory to get all subdirectories
2733
+ # (model, config, net, etc.) since types may be defined in different modules
2734
+ source_root = Path(options.path)
2735
+
2736
+ # If path points to repo root, go down to src/v
2737
+ if (source_root / 'src' / 'v').exists():
2738
+ source_root = source_root / 'src' / 'v'
2739
+ # If path points to a specific subdirectory, go up to the parent 'v' directory
2740
+ elif source_root.name in ('config', 'model', 'net', 'kafka', 'pandaproxy', 'security'):
2741
+ source_root = source_root.parent
2742
+
2743
+ logger.debug(f"Extracting type definitions from: {source_root}")
2744
+ definitions = extract_definitions_from_source(str(source_root))
2745
+ logger.info(f"✅ Extracted {len(definitions)} type definitions dynamically")
2746
+
2747
+ # Store definitions in global cache for transformers to access
2748
+ _type_definitions_cache = definitions
2749
+ except Exception as e:
2750
+ logger.warning(f"Failed to extract dynamic definitions: {e}")
2751
+ definitions = {}
1839
2752
 
1840
- # Load property overrides if provided
2753
+ # Load overrides file (contains both property and definition overrides)
1841
2754
  overrides = None
1842
2755
  if options.overrides:
1843
2756
  try:
1844
2757
  with open(options.overrides) as f:
1845
2758
  overrides = json.load(f)
2759
+
2760
+ # Load definition overrides from the overrides file
2761
+ if overrides and "definitions" in overrides:
2762
+ definition_overrides = overrides["definitions"]
2763
+ num_overrides = len(definition_overrides)
2764
+ definitions.update(definition_overrides)
2765
+ _type_definitions_cache = definitions
2766
+ logger.info(f"📝 Loaded {num_overrides} definition overrides from {options.overrides}")
1846
2767
  except Exception as e:
1847
2768
  logging.error(f"Failed to load overrides file: {e}")
1848
2769
  sys.exit(1)
1849
2770
 
2771
+ # DEPRECATED: Support legacy --definitions flag for backward compatibility
2772
+ # Users should migrate to putting definitions in overrides.json under "definitions" key
2773
+ if options.definitions and os.path.exists(options.definitions):
2774
+ try:
2775
+ logger.warning("⚠️ --definitions flag is deprecated. Please move definitions to overrides.json under 'definitions' key")
2776
+ with open(options.definitions) as json_file:
2777
+ static_definitions = json.load(json_file)
2778
+
2779
+ # Merge: static overrides take precedence
2780
+ num_overrides = len(static_definitions)
2781
+ definitions.update(static_definitions)
2782
+ _type_definitions_cache = definitions
2783
+
2784
+ logger.info(f"📝 Loaded {num_overrides} legacy definition overrides from {options.definitions}")
2785
+ except json.JSONDecodeError as e:
2786
+ logging.error(f"Failed to parse definitions file: {e}")
2787
+ sys.exit(1)
2788
+
1850
2789
  treesitter_dir = os.path.join(os.getcwd(), "tree-sitter/tree-sitter-cpp")
1851
2790
  destination_path = os.path.join(treesitter_dir, "tree-sitter-cpp.so")
1852
2791
 
@@ -1858,6 +2797,11 @@ def main():
1858
2797
  treesitter_dir, destination_path
1859
2798
  )
1860
2799
 
2800
+ # Pre-build constexpr cache for performance
2801
+ # This avoids repeated filesystem walks when resolving C++ identifiers and function calls
2802
+ logger.info("🔧 Building constexpr identifier cache...")
2803
+ _constexpr_cache.build_cache(options.path)
2804
+ logger.info(f"✅ Cached {len(_constexpr_cache.constexpr_cache)} constexpr identifiers and {len(_constexpr_cache.function_cache)} functions")
1861
2805
 
1862
2806
  files_with_properties = get_files_with_properties(
1863
2807
  file_pairs, treesitter_parser, cpp_language
@@ -1867,9 +2811,28 @@ def main():
1867
2811
  # Extract topic properties and add them to the main properties dictionary
1868
2812
  topic_properties = extract_topic_properties(options.path)
1869
2813
  if topic_properties:
2814
+ # Apply transformers to topic properties to ensure they get the same metadata as cluster properties
2815
+ topic_properties = apply_transformers_to_topic_properties(topic_properties)
1870
2816
  properties.update(topic_properties)
1871
2817
  logging.info(f"Added {len(topic_properties)} topic properties to the main properties collection")
1872
2818
 
2819
+ # Fix up corresponding_cluster_property mappings
2820
+ # Some cluster properties have a "_default" suffix that the extractor doesn't catch
2821
+ fixup_count = 0
2822
+ for prop_name, prop_data in properties.items():
2823
+ if prop_data.get('is_topic_property') and prop_data.get('corresponding_cluster_property'):
2824
+ cluster_prop = prop_data['corresponding_cluster_property']
2825
+ # Check if the mapped cluster property exists
2826
+ if cluster_prop not in properties:
2827
+ # Try the _default variant
2828
+ default_variant = f'{cluster_prop}_default'
2829
+ if default_variant in properties:
2830
+ prop_data['corresponding_cluster_property'] = default_variant
2831
+ fixup_count += 1
2832
+
2833
+ if fixup_count > 0:
2834
+ logging.info(f"Fixed {fixup_count} cluster property mappings by adding '_default' suffix")
2835
+
1873
2836
  # First, create the original properties without overrides for the base JSON output
1874
2837
  # 1. Add config_scope field based on which source file defines the property
1875
2838
  original_properties = add_config_scope(deepcopy(properties))
@@ -1893,10 +2856,22 @@ def main():
1893
2856
 
1894
2857
  # 3. Resolve type references and expand default values for original properties
1895
2858
  original_properties = resolve_type_and_default(original_properties, definitions)
1896
-
2859
+
2860
+ # 4. Map enum default values to user-facing strings
2861
+ original_properties = map_enum_defaults(original_properties)
2862
+
2863
+ # 5. Evaluate chrono expressions in default values
2864
+ original_properties = evaluate_chrono_expressions(original_properties)
2865
+
2866
+ # 6. Filter definitions to only include referenced types (reduces bloat)
2867
+ filtered_definitions = filter_referenced_definitions(original_properties, definitions)
2868
+
2869
+ # 6. Clean private fields from definitions (keep JSON output clean)
2870
+ filtered_definitions = clean_private_fields_from_definitions(filtered_definitions)
2871
+
1897
2872
  # Generate original properties JSON (without overrides)
1898
2873
  original_properties_and_definitions = merge_properties_and_definitions(
1899
- original_properties, definitions
2874
+ original_properties, filtered_definitions
1900
2875
  )
1901
2876
  original_json_output = json.dumps(original_properties_and_definitions, indent=4, sort_keys=True)
1902
2877
 
@@ -1914,14 +2889,26 @@ def main():
1914
2889
 
1915
2890
  # 4. Resolve type references and expand default values
1916
2891
  # This step converts:
1917
- # - C++ type names (model::broker_endpoint) to JSON schema types (object)
2892
+ # - C++ type names (model::broker_endpoint) to JSON schema types (object)
1918
2893
  # - C++ constructor defaults to structured JSON objects
1919
2894
  # - Single object defaults to arrays for one_or_many_property types
1920
2895
  enhanced_properties = resolve_type_and_default(enhanced_properties, definitions)
1921
2896
 
2897
+ # 5. Map enum default values to user-facing strings
2898
+ enhanced_properties = map_enum_defaults(enhanced_properties)
2899
+
2900
+ # 6. Evaluate chrono expressions in default values
2901
+ enhanced_properties = evaluate_chrono_expressions(enhanced_properties)
2902
+
2903
+ # 7. Filter definitions to only include referenced types (reduces bloat)
2904
+ filtered_enhanced_definitions = filter_referenced_definitions(enhanced_properties, definitions)
2905
+
2906
+ # 7. Clean private fields from definitions (keep JSON output clean)
2907
+ filtered_enhanced_definitions = clean_private_fields_from_definitions(filtered_enhanced_definitions)
2908
+
1922
2909
  # Generate enhanced properties JSON (with overrides)
1923
2910
  enhanced_properties_and_definitions = merge_properties_and_definitions(
1924
- enhanced_properties, definitions
2911
+ enhanced_properties, filtered_enhanced_definitions
1925
2912
  )
1926
2913
  enhanced_json_output = json.dumps(enhanced_properties_and_definitions, indent=4, sort_keys=True)
1927
2914