regscale-cli 6.19.1.0__py3-none-any.whl → 6.20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of regscale-cli might be problematic. Click here for more details.

Files changed (36)
  1. regscale/__init__.py +1 -1
  2. regscale/airflow/config.py +2 -0
  3. regscale/airflow/tasks/groups.py +11 -47
  4. regscale/core/app/internal/login.py +49 -43
  5. regscale/core/app/internal/model_editor.py +2 -1
  6. regscale/dev/code_gen.py +2 -5
  7. regscale/integrations/commercial/amazon/common.py +5 -4
  8. regscale/integrations/commercial/aws/scanner.py +3 -2
  9. regscale/integrations/commercial/synqly/assets.py +20 -0
  10. regscale/integrations/commercial/synqly/ticketing.py +25 -0
  11. regscale/integrations/commercial/wizv2/click.py +3 -3
  12. regscale/integrations/public/fedramp/appendix_parser.py +499 -104
  13. regscale/integrations/public/fedramp/fedramp_five.py +89 -43
  14. regscale/integrations/scanner_integration.py +1 -1
  15. regscale/models/app_models/import_validater.py +2 -0
  16. regscale/models/integration_models/cisa_kev_data.json +355 -27
  17. regscale/models/integration_models/flat_file_importer/__init__.py +26 -9
  18. regscale/models/integration_models/synqly_models/capabilities.json +1 -1
  19. regscale/models/regscale_models/__init__.py +5 -0
  20. regscale/models/regscale_models/business_impact_assessment.py +71 -0
  21. regscale/models/regscale_models/control_implementation.py +15 -0
  22. regscale/models/regscale_models/master_assessment.py +19 -0
  23. regscale/models/regscale_models/policy.py +90 -0
  24. regscale/models/regscale_models/question.py +30 -2
  25. regscale/models/regscale_models/questionnaire.py +4 -3
  26. regscale/models/regscale_models/questionnaire_instance.py +37 -14
  27. regscale/models/regscale_models/rbac.py +0 -1
  28. regscale/models/regscale_models/regscale_model.py +16 -15
  29. regscale/models/regscale_models/risk_trend.py +67 -0
  30. regscale/utils/graphql_client.py +2 -1
  31. {regscale_cli-6.19.1.0.dist-info → regscale_cli-6.20.0.0.dist-info}/METADATA +130 -71
  32. {regscale_cli-6.19.1.0.dist-info → regscale_cli-6.20.0.0.dist-info}/RECORD +36 -33
  33. {regscale_cli-6.19.1.0.dist-info → regscale_cli-6.20.0.0.dist-info}/LICENSE +0 -0
  34. {regscale_cli-6.19.1.0.dist-info → regscale_cli-6.20.0.0.dist-info}/WHEEL +0 -0
  35. {regscale_cli-6.19.1.0.dist-info → regscale_cli-6.20.0.0.dist-info}/entry_points.txt +0 -0
  36. {regscale_cli-6.19.1.0.dist-info → regscale_cli-6.20.0.0.dist-info}/top_level.txt +0 -0
@@ -44,7 +44,30 @@ ORIGINATIONS = [
44
44
  ]
45
45
  LOWER_ORIGINATIONS = [origin.lower() for origin in ORIGINATIONS]
46
46
  DEFAULT_ORIGINATION = "Service Provider Corporate"
47
- POSITIVE_KEYWORDS = ["yes", "true", "1", "☒", "True", "Yes", "☑", "☑️"]
47
+ POSITIVE_KEYWORDS = [
48
+ "yes",
49
+ "true",
50
+ "1",
51
+ "☒",
52
+ "True",
53
+ "Yes",
54
+ "☑",
55
+ "☑️",
56
+ "✓",
57
+ "✔",
58
+ "✔️",
59
+ "✅",
60
+ "⬜",
61
+ "▣",
62
+ "■",
63
+ "□",
64
+ "⊠",
65
+ "⊗",
66
+ "×",
67
+ "checked",
68
+ "selected",
69
+ "chosen",
70
+ ]
48
71
 
49
72
  # Define your keywords or phrases that map to each status
50
73
  STATUS_KEYWORDS = {
@@ -125,23 +148,135 @@ class AppendixAParser:
125
148
 
126
149
  @staticmethod
127
150
  def determine_origination(text: str) -> Optional[str]:
151
+ """
152
+ Determine the origination from the text. Multiple originations may be found and
153
+ returned as a comma-separated string.
154
+
155
+ :param str text: The text to analyze for origination values
156
+ :return: Comma-separated string of origination values or None if none found
157
+ :rtype: Optional[str]
158
+ """
159
+ if CONTROL_ORIGIN_KEY not in text:
160
+ return None
161
+
162
+ # Clean and standardize the text for processing
163
+ lower_text = AppendixAParser._clean_text_for_processing(text)
164
+
165
+ # Find all matching originations
166
+ found_originations = AppendixAParser._find_originations_in_text(lower_text)
167
+
168
+ if found_originations:
169
+ return ",".join(found_originations)
170
+ return None
171
+
172
+ @staticmethod
173
+ def _clean_text_for_processing(text: str) -> str:
174
+ """
175
+ Clean and standardize text for processing.
176
+
177
+ :param str text: The text to clean
178
+ :return: Cleaned and standardized text
179
+ :rtype: str
180
+ """
128
181
  tokens = text.split()
129
182
  rejoined_text = " ".join(tokens) # this removes any newlines or spaces
130
183
  rejoined_text = rejoined_text.replace("( ", "(")
131
184
  rejoined_text = rejoined_text.replace(" )", ")")
185
+ return rejoined_text.lower()
186
+
187
+ @staticmethod
188
+ def _find_originations_in_text(lower_text: str) -> List[str]:
189
+ """
190
+ Find all originations in the text.
191
+
192
+ :param str lower_text: The lowercase text to search for originations
193
+ :return: List of found originations
194
+ :rtype: List[str]
195
+ """
196
+ # Common checkbox characters in various fonts and styles
197
+ checkbox_chars = ["☒", "☑", "☑️", "✓", "✔", "✔️", "✅", "⬜", "▣", "■", "□", "⊠", "⊗", "×"]
198
+
199
+ found_originations = []
132
200
 
133
- if CONTROL_ORIGIN_KEY not in text:
134
- return None
135
201
  for origin in ORIGINATIONS:
136
- for keyword in POSITIVE_KEYWORDS:
137
- valid_option = f"{keyword} {origin}".lower()
138
- lower_text = rejoined_text.lower()
139
- if valid_option in lower_text:
140
- return origin # Return the first matching status
141
- return None
202
+ if AppendixAParser._check_origin_with_keywords(origin, lower_text):
203
+ found_originations.append(origin)
204
+ continue
205
+
206
+ if AppendixAParser._check_origin_with_checkbox_chars(origin, lower_text, checkbox_chars):
207
+ found_originations.append(origin)
208
+ continue
209
+
210
+ if AppendixAParser._check_origin_with_text_patterns(origin, lower_text):
211
+ found_originations.append(origin)
212
+
213
+ return found_originations
214
+
215
+ @staticmethod
216
+ def _check_origin_with_keywords(origin: str, lower_text: str) -> bool:
217
+ """
218
+ Check if origin is indicated with known keywords.
219
+
220
+ :param str origin: The origin to check for
221
+ :param str lower_text: The text to search in
222
+ :return: True if origin is found with keywords, False otherwise
223
+ :rtype: bool
224
+ """
225
+ for keyword in POSITIVE_KEYWORDS:
226
+ # Check with space between checkbox and origin
227
+ valid_option_with_space = f"{keyword} {origin}".lower()
228
+ # Check without space between checkbox and origin
229
+ valid_option_without_space = f"{keyword}{origin}".lower()
230
+
231
+ if valid_option_with_space in lower_text or valid_option_without_space in lower_text:
232
+ return True
233
+ return False
234
+
235
+ @staticmethod
236
+ def _check_origin_with_checkbox_chars(origin: str, lower_text: str, checkbox_chars: List[str]) -> bool:
237
+ """
238
+ Check if origin is indicated with checkbox characters.
239
+
240
+ :param str origin: The origin to check for
241
+ :param str lower_text: The text to search in
242
+ :param List[str] checkbox_chars: List of checkbox characters to check for
243
+ :return: True if origin is found with checkbox characters, False otherwise
244
+ :rtype: bool
245
+ """
246
+ for char in checkbox_chars:
247
+ # Check with and without space
248
+ if f"{char} {origin}".lower() in lower_text or f"{char}{origin}".lower() in lower_text:
249
+ return True
250
+ return False
251
+
252
+ @staticmethod
253
+ def _check_origin_with_text_patterns(origin: str, lower_text: str) -> bool:
254
+ """
255
+ Check if origin is indicated with text patterns.
256
+
257
+ :param str origin: The origin to check for
258
+ :param str lower_text: The text to search in
259
+ :return: True if origin is found with text patterns, False otherwise
260
+ :rtype: bool
261
+ """
262
+ # Look for patterns like "X is checked" or "X is selected"
263
+ check_patterns = [
264
+ f"{origin.lower()} is checked",
265
+ f"{origin.lower()} is selected",
266
+ f"{origin.lower()} (checked)",
267
+ f"{origin.lower()} (selected)",
268
+ f"selected: {origin.lower()}",
269
+ ]
270
+ return any(pattern in lower_text for pattern in check_patterns)
142
271
 
143
272
  @staticmethod
144
273
  def determine_status(text: str) -> str:
274
+ """
275
+ Determine the implementation status from the text.
276
+ :param str text: The text to analyze for implementation status
277
+ :return: The determined implementation status
278
+ :rtype: str
279
+ """
145
280
  # Tokenize the input text
146
281
  tokens = text.split()
147
282
 
@@ -150,23 +285,54 @@ class AppendixAParser:
150
285
 
151
286
  matches = []
152
287
 
288
+ # Common checkbox characters in various fonts and styles
289
+ checkbox_chars = ["☒", "☑", "☑️", "✓", "✔", "✔️", "✅", "⬜", "▣", "■", "□", "⊠", "⊗", "×"]
290
+
153
291
  # Search for keywords in the tokenized text to determine the status
154
292
  for status, keywords in STATUS_KEYWORDS.items():
155
293
  for keyword in keywords:
156
- if f"1 {keyword}" in token_string or f"☒ {keyword}" in token_string:
294
+ # Check patterns with space: "1 keyword" or "☒ keyword" or any other checkbox char
295
+ if f"1 {keyword}" in token_string or any(
296
+ f"{char} {keyword}" in token_string for char in checkbox_chars
297
+ ):
157
298
  matches.append(status)
299
+ break
300
+
301
+ # Check patterns without space: "1keyword" or "☒keyword" or any other checkbox char
302
+ elif f"1{keyword}" in token_string or any(
303
+ f"{char}{keyword}" in token_string for char in checkbox_chars
304
+ ):
305
+ matches.append(status)
306
+ break
307
+
308
+ # Also check for direct True/Yes values next to keywords
309
+ elif any(pos + keyword in token_string for pos in ["true", "yes"]):
310
+ matches.append(status)
311
+ break
158
312
 
159
313
  # Determine the status to return
160
314
  if len(matches) > 1:
161
315
  # More than one match found
162
- # not applicable takes presendence over planned/partially implemented (only 2 valid multi select statuses for fedramp)
163
- if matches[1] == NA_STATUS:
164
- return matches[1]
316
+ # Not applicable takes precedence over planned/partially implemented (only 2 valid multi select statuses for fedramp)
317
+ if NA_STATUS in matches:
318
+ return NA_STATUS
165
319
  else:
166
320
  return matches[0]
167
321
  elif matches:
168
322
  return matches[0] # Return the first match if only one
169
323
  else:
324
+ # Extra fallback for unusual checkbox patterns
325
+ # Look for any checkbox-like character anywhere in the text without keywords
326
+ for status, keywords in STATUS_KEYWORDS.items():
327
+ for keyword in keywords:
328
+ # Skip the checkbox characters themselves (already checked above)
329
+ if keyword in checkbox_chars:
330
+ continue
331
+
332
+ # Check if any checkbox character is present in the text alongside common implementation terms
333
+ if any(char in token_string for char in checkbox_chars) and keyword in token_string:
334
+ return status
335
+
170
336
  return DEFAULT_STATUS # No matches found
171
337
 
172
338
  @staticmethod
@@ -192,28 +358,100 @@ class AppendixAParser:
192
358
  :return: The state of the checkbox.
193
359
  :rtype: bool
194
360
  """
195
- # First, try getting the attribute 'val' directly
361
+ # Try different methods to determine checkbox state
362
+ methods = [
363
+ AppendixAParser._check_direct_val_attribute,
364
+ AppendixAParser._check_checked_element,
365
+ AppendixAParser._check_default_element,
366
+ AppendixAParser._check_child_elements,
367
+ AppendixAParser._check_attributes,
368
+ AppendixAParser._check_namespace_attributes,
369
+ ]
370
+
371
+ for method in methods:
372
+ result = method(checkbox_element)
373
+ if result is not None:
374
+ return result
375
+
376
+ # If none of the methods worked, return False
377
+ return False
378
+
379
+ @staticmethod
380
+ def _check_direct_val_attribute(element: Any) -> Optional[bool]:
381
+ """Check if element has a direct 'val' attribute."""
196
382
  val = "{%s}%s" % (SCHEMA, "val")
197
- checked = "{%s}%s" % (SCHEMA, "checked")
198
- default = "{%s}%s" % (SCHEMA, "default")
199
- state = checkbox_element.get(val)
383
+ state = element.get(val)
200
384
  if state is not None:
201
385
  return state == "1"
386
+ return None
202
387
 
203
- # If not found, look for a child element 'checked' that may contain the 'val' attribute
204
- checked_element = checkbox_element.find(checked)
205
- if checked_element is not None:
206
- state = checked_element.get(val)
207
- return state == "1"
388
+ @staticmethod
389
+ def _check_checked_element(element: Any) -> Optional[bool]:
390
+ """Check if element has a 'checked' child with a 'val' attribute."""
391
+ val = "{%s}%s" % (SCHEMA, "val")
392
+ checked = "{%s}%s" % (SCHEMA, "checked")
393
+ return AppendixAParser._check_element_with_val(element, checked, val)
208
394
 
209
- # If still not found, check for a 'default' state as a fallback
210
- default_element = checkbox_element.find(default)
211
- if default_element is not None:
212
- state = default_element.get(val)
213
- return state == "1"
395
+ @staticmethod
396
+ def _check_default_element(element: Any) -> Optional[bool]:
397
+ """Check if element has a 'default' child with a 'val' attribute."""
398
+ val = "{%s}%s" % (SCHEMA, "val")
399
+ default = "{%s}%s" % (SCHEMA, "default")
400
+ return AppendixAParser._check_element_with_val(element, default, val)
214
401
 
215
- # If there's no indication of the state, return False or handle accordingly
216
- return False
402
+ @staticmethod
403
+ def _check_element_with_val(parent: Any, child_tag: str, val_tag: str) -> Optional[bool]:
404
+ """
405
+ Check if a child element has a 'val' attribute.
406
+
407
+ :param Any parent: The parent element
408
+ :param str child_tag: The child element tag
409
+ :param str val_tag: The value attribute tag
410
+ :return: True if val is "1", False if val is not "1", None if element or val not found
411
+ :rtype: Optional[bool]
412
+ """
413
+ child_element = parent.find(child_tag)
414
+ if child_element is not None:
415
+ state = child_element.get(val_tag)
416
+ if state is not None:
417
+ return state == "1"
418
+ return None
419
+
420
+ @staticmethod
421
+ def _check_child_elements(element: Any) -> Optional[bool]:
422
+ """Check all child elements for a 'val' attribute."""
423
+ val = "{%s}%s" % (SCHEMA, "val")
424
+ try:
425
+ for child in element.getchildren():
426
+ if child.get(val) is not None:
427
+ return child.get(val) == "1"
428
+ except (AttributeError, TypeError):
429
+ pass
430
+ return None
431
+
432
+ @staticmethod
433
+ def _check_attributes(element: Any) -> Optional[bool]:
434
+ """Check all attributes for check-related names."""
435
+ try:
436
+ for attr_name, attr_value in element.attrib.items():
437
+ if "checked" in attr_name.lower() or "val" in attr_name.lower() or "state" in attr_name.lower():
438
+ return attr_value in ["1", "true", "checked", "on"]
439
+ except (AttributeError, TypeError):
440
+ pass
441
+ return None
442
+
443
+ @staticmethod
444
+ def _check_namespace_attributes(element: Any) -> Optional[bool]:
445
+ """Check attributes in all namespaces."""
446
+ try:
447
+ for ns, uri in element.nsmap.items():
448
+ for attr_name in ["val", "checked", "state", "default"]:
449
+ attr_with_ns = "{%s}%s" % (uri, attr_name)
450
+ if element.get(attr_with_ns) is not None:
451
+ return element.get(attr_with_ns) in ["1", "true", "checked", "on"]
452
+ except (AttributeError, TypeError):
453
+ pass
454
+ return None
217
455
 
218
456
  def get_implementation_statuses(self) -> Dict:
219
457
  """
@@ -268,25 +506,58 @@ class AppendixAParser:
268
506
  :param Dict control_dict: The dictionary containing the control implementation data.
269
507
  :param str check: The check string to exclude from the part value.
270
508
  """
509
+ part_list = control_dict.get("parts", [])
510
+
271
511
  if cell_count > 1:
272
- name = self.get_cell_text(cells[0]) if cells[0].text else DEFAULT_PART
273
- value = self.get_cell_text(cells[1])
274
- part_list = control_dict.get("parts", [])
275
- val_dict = {"name": name, "value": value}
276
- if check not in value.lower() and val_dict not in part_list:
277
- part_list.append(val_dict)
278
- control_dict["parts"] = part_list
512
+ self._handle_multicolumn_part(cells, part_list, check)
279
513
  else:
280
- value = self.get_cell_text(cells[0])
281
- value_lower = value.lower()
282
- pattern = re.compile(r"\b(" + "|".join(re.escape(part) for part in self.parts_set) + r")\b", re.IGNORECASE)
283
- match = pattern.search(value_lower)
284
- name = match.group(1) if match else DEFAULT_PART
285
- part_list = control_dict.get("parts", [])
286
- val_dict = {"name": name, "value": value}
287
- if check.lower() not in value_lower and val_dict not in part_list:
288
- part_list.append(val_dict)
289
- control_dict["parts"] = part_list
514
+ self._handle_single_column_part(cells[0], part_list, check)
515
+
516
+ control_dict["parts"] = part_list
517
+
518
+ def _handle_multicolumn_part(self, cells: Any, part_list: List, check: str):
519
+ """
520
+ Handle a part with multiple columns.
521
+
522
+ :param Any cells: The cells in the row.
523
+ :param List part_list: List to add parts to.
524
+ :param str check: The check string to exclude from part value.
525
+ """
526
+ name = self.get_cell_text(cells[0]) if cells[0].text else DEFAULT_PART
527
+ value = self.get_cell_text(cells[1])
528
+ val_dict = {"name": name, "value": value}
529
+ if check not in value.lower() and val_dict not in part_list:
530
+ part_list.append(val_dict)
531
+
532
+ def _handle_single_column_part(self, cell: Any, part_list: List, check: str):
533
+ """
534
+ Handle a part with a single column.
535
+
536
+ :param Any cell: The cell to process.
537
+ :param List part_list: List to add parts to.
538
+ :param str check: The check string to exclude from part value.
539
+ """
540
+ value = self.get_cell_text(cell)
541
+ value_lower = value.lower()
542
+
543
+ # Find part name using regex pattern
544
+ name = self._extract_part_name(value_lower)
545
+
546
+ val_dict = {"name": name, "value": value}
547
+ if check.lower() not in value_lower and val_dict not in part_list:
548
+ part_list.append(val_dict)
549
+
550
+ def _extract_part_name(self, text: str) -> str:
551
+ """
552
+ Extract part name from text using regex.
553
+
554
+ :param str text: The text to extract from.
555
+ :return: The extracted part name or default part name.
556
+ :rtype: str
557
+ """
558
+ pattern = re.compile(r"\b(" + "|".join(re.escape(part) for part in self.parts_set) + r")\b", re.IGNORECASE)
559
+ match = pattern.search(text)
560
+ return match.group(1) if match else DEFAULT_PART
290
561
 
291
562
  def set_cell_text(self, cell: Any):
292
563
  """
@@ -330,6 +601,8 @@ class AppendixAParser:
330
601
  self._handle_implementation_status()
331
602
  self._handle_implementation_origination()
332
603
  self._handle_implementation_statement()
604
+ # Comment out the implementation parts handling as it requires parameters not available in this context
605
+ # We'll rely on the handle_row_parts method to handle parts instead
333
606
  # self._handle_implementation_parts(cell_index, cells)
334
607
  self._handle_responsibility()
335
608
 
@@ -363,23 +636,44 @@ class AppendixAParser:
363
636
  """
364
637
  Handle the origination of the control implementation.
365
638
  """
639
+ origination_values = []
640
+
641
+ # Check if we're in a Control Summary section and have Control Origination text
366
642
  if (
367
- self.cell_data_status
368
- and any(
369
- [self.score_similarity(self.cell_data_status.lower(), origin) > 90 for origin in LOWER_ORIGINATIONS]
370
- )
371
- and CONTROL_SUMMARY_KEY.lower() in self.header_row_text.lower()
643
+ CONTROL_SUMMARY_KEY.lower() in self.header_row_text.lower()
372
644
  and CONTROL_ORIGIN_KEY.lower() in self.joined_processed_texts.lower()
373
- and self.header_row_text.split(" ")[0] in self.controls_implementations
645
+ and self.control_id in self.controls_implementations
646
+ and self.controls_implementations[self.control_id] is not None
374
647
  ):
375
- if self.control_id in self.controls_implementations:
648
+ # Method 1: Check cell_data for origination values based on checkbox states
649
+ for key, value in self.cell_data.items():
650
+ if value and any(origin.lower() in key.lower() for origin in ORIGINATIONS):
651
+ # Find the matching origination from the known list
652
+ for origin in ORIGINATIONS:
653
+ if origin.lower() in key.lower():
654
+ logger.debug(f"Found origination from checkbox: {origin}")
655
+ if origin not in origination_values:
656
+ origination_values.append(origin)
657
+ break
658
+
659
+ # Method 2: Try determine_origination as backup
660
+ if orig := self.determine_origination(self.joined_processed_texts):
661
+ logger.debug(f"Found origination from text: {orig}")
662
+ # Handle multiple comma-separated values in the determine_origination result
663
+ for origin in orig.split(","):
664
+ if origin.strip() and origin.strip() not in origination_values:
665
+ origination_values.append(origin.strip())
666
+
667
+ # Save all origination values as comma-delimited string
668
+ if origination_values:
376
669
  control_dict = self.controls_implementations[self.control_id]
377
- control_dict["origination"] = self.cell_data_status
378
- elif origination := self.determine_origination(self.joined_processed_texts):
379
- if origination in ORIGINATIONS:
380
- if self.control_id in self.controls_implementations:
381
- control_dict = self.controls_implementations[self.control_id]
382
- control_dict["origination"] = origination
670
+ control_dict["origination"] = ",".join(origination_values)
671
+ logger.debug(f"Setting origination for {self.control_id}: {control_dict['origination']}")
672
+ elif DEFAULT_ORIGINATION:
673
+ # Set default if none found
674
+ control_dict = self.controls_implementations[self.control_id]
675
+ control_dict["origination"] = DEFAULT_ORIGINATION
676
+ logger.debug(f"Setting default origination for {self.control_id}: {DEFAULT_ORIGINATION}")
383
677
 
384
678
  def _handle_implementation_status(self):
385
679
  """
@@ -439,38 +733,80 @@ class AppendixAParser:
439
733
  """
440
734
  value_check = f"{self.control_id} What is the solution and how is it implemented?"
441
735
  generic_value_check = "What is the solution and how is it implemented".lower()
442
- if (
736
+
737
+ # Skip processing if conditions aren't met
738
+ if not self._should_process_parts(value_check, generic_value_check):
739
+ return
740
+
741
+ part_value = self.joined_processed_texts.strip()
742
+ control_dict = self.controls_implementations.get(self.control_id, {})
743
+ part_list = control_dict.get("parts", [])
744
+
745
+ # Check if this is a part declaration
746
+ if not self._is_part_declaration(part_value):
747
+ return
748
+
749
+ part_name = part_value.strip() or DEFAULT_PART
750
+ part_value = self._combine_part_text(part_name, part_value, cell_index, cells)
751
+
752
+ # Build the part dictionary
753
+ self.build_part_dict(
754
+ part_name=part_name,
755
+ part_value=part_value,
756
+ control_dict=control_dict,
757
+ part_list=part_list,
758
+ generic_value_check=generic_value_check,
759
+ )
760
+
761
+ def _should_process_parts(self, value_check: str, generic_value_check: str) -> bool:
762
+ """
763
+ Determine if parts processing should continue.
764
+
765
+ :param str value_check: Value check string for this specific control
766
+ :param str generic_value_check: Generic value check string
767
+ :return: True if processing should continue, False otherwise
768
+ :rtype: bool
769
+ """
770
+ return (
443
771
  generic_value_check in self.header_row_text.lower()
444
772
  and value_check.lower() != self.joined_processed_texts.lower()
445
773
  and self.control_id in self.controls_implementations
446
- ):
447
- part_value = self.joined_processed_texts.strip()
448
- control_dict = self.controls_implementations.get(self.control_id, {})
449
- part_list = control_dict.get("parts", [])
450
- if any(
451
- [
452
- part_value.strip().lower() == p.lower() or part_value.strip().lower() == f"{p.lower()}:"
453
- for p in self.parts
454
- ]
455
- ):
456
- part_name = part_value.strip() or DEFAULT_PART
457
- next_cell_text = self.get_cell_text(cells[cell_index + 1])
458
- if ":" not in part_value:
459
- part_value = ": ".join(
460
- [
461
- part_value.strip(),
462
- next_cell_text.strip(),
463
- ]
464
- )
465
- else:
466
- part_value = " ".join([part_value.strip(), next_cell_text.strip()])
467
- self.build_part_dict(
468
- part_name=part_name,
469
- part_value=part_value,
470
- control_dict=control_dict,
471
- part_list=part_list,
472
- generic_value_check=generic_value_check,
473
- )
774
+ )
775
+
776
+ def _is_part_declaration(self, part_value: str) -> bool:
777
+ """
778
+ Check if the value is a part declaration.
779
+
780
+ :param str part_value: The value to check
781
+ :return: True if it's a part declaration, False otherwise
782
+ :rtype: bool
783
+ """
784
+ return any(
785
+ [
786
+ part_value.strip().lower() == p.lower() or part_value.strip().lower() == f"{p.lower()}:"
787
+ for p in self.parts
788
+ ]
789
+ )
790
+
791
+ def _combine_part_text(self, part_name: str, part_value: str, cell_index: int, cells: Any) -> str:
792
+ """
793
+ Combine part text from potentially multiple cells.
794
+
795
+ :param str part_name: Name of the part
796
+ :param str part_value: Current value text
797
+ :param int cell_index: Current cell index
798
+ :param Any cells: All cells in the row
799
+ :return: Combined part text
800
+ :rtype: str
801
+ """
802
+ next_cell_text = self.get_cell_text(cells[cell_index + 1])
803
+
804
+ if ":" not in part_value:
805
+ # If part_value doesn't have a colon, add the next cell's text after a colon
806
+ return ": ".join([part_value.strip(), next_cell_text.strip()])
807
+ else:
808
+ # If part_value already has a colon, just add the next cell's text
809
+ return " ".join([part_value.strip(), next_cell_text.strip()])
474
810
 
475
811
  def build_part_dict(
476
812
  self, part_name: str, part_value: str, control_dict: Dict, part_list: List, generic_value_check: str
@@ -484,24 +820,42 @@ class AppendixAParser:
484
820
  :param str generic_value_check: The generic value check string.
485
821
  """
486
822
  if part_value.lower().startswith("part"):
487
- parts = part_value.split(":", 1)
488
- part_dict = {"name": part_name, "value": DEFAULT_PART}
489
- if len(parts) == 2 and parts[1].strip() != "":
490
- part_dict["name"] = parts[0].strip()
491
- part_dict["value"] = parts[1].strip()
492
- logger.debug(f"Part: {part_dict}")
493
- self.add_to_list(new_dict=part_dict, the_list=part_list)
494
- elif part_value.strip() != "" and generic_value_check not in part_value.lower():
495
- part_dict["value"] = part_value.strip()
496
- self.add_to_list(new_dict=part_dict, the_list=part_list)
823
+ self._handle_part_value_starting_with_part(part_name, part_value, part_list, generic_value_check)
497
824
  elif generic_value_check not in part_value.lower():
825
+ # For values that don't start with "part" but are valid
498
826
  pdict = {
499
827
  "name": DEFAULT_PART,
500
828
  "value": part_value.strip(),
501
829
  }
502
830
  self.add_to_list(new_dict=pdict, the_list=part_list)
831
+
503
832
  control_dict["parts"] = part_list
504
833
 
834
+ def _handle_part_value_starting_with_part(
835
+ self, part_name: str, part_value: str, part_list: List, generic_value_check: str
836
+ ):
837
+ """
838
+ Handle part values that start with "part".
839
+
840
+ :param str part_name: The name of the part
841
+ :param str part_value: The value of the part
842
+ :param List part_list: The list to add parts to
843
+ :param str generic_value_check: The generic value check string
844
+ """
845
+ parts = part_value.split(":", 1)
846
+ part_dict = {"name": part_name, "value": DEFAULT_PART}
847
+
848
+ if len(parts) == 2 and parts[1].strip() != "":
849
+ # If part value has a colon and content after it
850
+ part_dict["name"] = parts[0].strip()
851
+ part_dict["value"] = parts[1].strip()
852
+ logger.debug(f"Part: {part_dict}")
853
+ self.add_to_list(new_dict=part_dict, the_list=part_list)
854
+ elif part_value.strip() != "" and generic_value_check not in part_value.lower():
855
+ # If part value has no colon but is not empty and not the generic check
856
+ part_dict["value"] = part_value.strip()
857
+ self.add_to_list(new_dict=part_dict, the_list=part_list)
858
+
505
859
  @staticmethod
506
860
  def add_to_list(new_dict: Dict, the_list: List):
507
861
  """
@@ -530,14 +884,55 @@ class AppendixAParser:
530
884
  """
531
885
  Handle the checkbox states in the DOCX table.
532
886
  """
533
- updated_checkbox_states = [self._get_checkbox_state(state) for state in self.checkbox_states]
534
- for item in self.processed_texts[1:]:
535
- if isinstance(item, dict):
536
- self.cell_data.update(item)
537
- else:
538
- self.cell_data[item.strip()] = updated_checkbox_states.pop(0) if updated_checkbox_states else None
887
+ try:
888
+ # Get checkbox states
889
+ updated_checkbox_states = []
890
+ for checkbox in self.checkbox_states:
891
+ try:
892
+ is_checked = self._get_checkbox_state(checkbox)
893
+ updated_checkbox_states.append(is_checked)
894
+ logger.debug(f"Checkbox state: {is_checked}")
895
+ except Exception as e:
896
+ # If we can't determine the state, assume it's not checked
897
+ logger.debug(f"Error getting checkbox state: {e}")
898
+ updated_checkbox_states.append(False)
899
+
900
+ # Log total checkboxes found
901
+ logger.debug(f"Found {len(updated_checkbox_states)} checkbox states: {updated_checkbox_states}")
902
+
903
+ # First handle any dictionary items in processed_texts
904
+ for item in self.processed_texts:
905
+ if isinstance(item, dict):
906
+ self.cell_data.update(item)
907
+
908
+ # Handle text items with corresponding checkbox states
909
+ text_items = [item for item in self.processed_texts if not isinstance(item, dict)]
910
+
911
+ # Match checkbox states to text items
912
+ for i, item in enumerate(text_items):
913
+ if i < len(updated_checkbox_states):
914
+ self.cell_data[item.strip()] = updated_checkbox_states[i]
915
+ else:
916
+ # If we have more text items than checkbox states, assume unchecked
917
+ self.cell_data[item.strip()] = False
918
+
919
+ # Also check for checkbox character directly in text
920
+ for key in list(self.cell_data.keys()):
921
+ # If text contains a checkbox character and state is False, try to determine true state from text
922
+ if not self.cell_data[key]:
923
+ checkbox_chars = ["☒", "☑", "☑️", "✓", "✔", "✔️", "✅", "⬜", "▣", "■", "□", "⊠", "⊗", "×"]
924
+ if any(char in key for char in checkbox_chars):
925
+ self.cell_data[key] = True
926
+
927
+ # Update cell data status
539
928
  self._get_cell_data_status()
540
929
 
930
+ except Exception as e:
931
+ logger.debug(f"Error in _handle_checkbox_states: {e}")
932
+ # Ensure we don't leave checkbox_states empty
933
+ if not hasattr(self, "cell_data") or self.cell_data is None:
934
+ self.cell_data = {}
935
+
541
936
  def _get_cell_data_status(self):
542
937
  """
543
938
  Get the status of the cell data.