PyPI - regscale-cli - Versions diffs - 6.19.1.0__py3-none-any.whl → 6.20.0.0__py3-none-any.whl - Mend - Supply Chain Defender

regscale-cli 6.19.1.0__py3-none-any.whl → 6.20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of regscale-cli might be problematic. Click here for more details.

Files changed (36) hide show

regscale/integrations/public/fedramp/appendix_parser.py CHANGED Viewed

@@ -44,7 +44,30 @@ ORIGINATIONS = [
 ]
 LOWER_ORIGINATIONS = [origin.lower() for origin in ORIGINATIONS]
 DEFAULT_ORIGINATION = "Service Provider Corporate"
-POSITIVE_KEYWORDS = ["yes", "true", "1", "☒", "True", "Yes", "☑", "☑️"]
+POSITIVE_KEYWORDS = [
+    "yes",
+    "true",
+    "1",
+    "☒",
+    "True",
+    "Yes",
+    "☑",
+    "☑️",
+    "✓",
+    "✔",
+    "✔️",
+    "✅",
+    "⬜",
+    "▣",
+    "■",
+    "□",
+    "⊠",
+    "⊗",
+    "×",
+    "checked",
+    "selected",
+    "chosen",
+]
 # Define your keywords or phrases that map to each status
 STATUS_KEYWORDS = {
@@ -125,23 +148,135 @@ class AppendixAParser:
     @staticmethod
     def determine_origination(text: str) -> Optional[str]:
+        """
+        Determine the origination from the text. Multiple originations may be found and
+        returned as a comma-separated string.
+        :param str text: The text to analyze for origination values
+        :return: Comma-separated string of origination values or None if none found
+        :rtype: Optional[str]
+        """
+        if CONTROL_ORIGIN_KEY not in text:
+            return None
+        # Clean and standardize the text for processing
+        lower_text = AppendixAParser._clean_text_for_processing(text)
+        # Find all matching originations
+        found_originations = AppendixAParser._find_originations_in_text(lower_text)
+        if found_originations:
+            return ",".join(found_originations)
+        return None
+    @staticmethod
+    def _clean_text_for_processing(text: str) -> str:
+        """
+        Clean and standardize text for processing.
+        :param str text: The text to clean
+        :return: Cleaned and standardized text
+        :rtype: str
+        """
         tokens = text.split()
         rejoined_text = " ".join(tokens)  # this removes any newlines or spaces
         rejoined_text = rejoined_text.replace("( ", "(")
         rejoined_text = rejoined_text.replace(" )", ")")
+        return rejoined_text.lower()
+    @staticmethod
+    def _find_originations_in_text(lower_text: str) -> List[str]:
+        """
+        Find all originations in the text.
+        :param str lower_text: The lowercase text to search for originations
+        :return: List of found originations
+        :rtype: List[str]
+        """
+        # Common checkbox characters in various fonts and styles
+        checkbox_chars = ["☒", "☑", "☑️", "✓", "✔", "✔️", "✅", "⬜", "▣", "■", "□", "⊠", "⊗", "×"]
+        found_originations = []
-        if CONTROL_ORIGIN_KEY not in text:
-            return None
         for origin in ORIGINATIONS:
-            for keyword in POSITIVE_KEYWORDS:
-                valid_option = f"{keyword} {origin}".lower()
-                lower_text = rejoined_text.lower()
-                if valid_option in lower_text:
-                    return origin  # Return the first matching status
-        return None
+            if AppendixAParser._check_origin_with_keywords(origin, lower_text):
+                found_originations.append(origin)
+                continue
+            if AppendixAParser._check_origin_with_checkbox_chars(origin, lower_text, checkbox_chars):
+                found_originations.append(origin)
+                continue
+            if AppendixAParser._check_origin_with_text_patterns(origin, lower_text):
+                found_originations.append(origin)
+        return found_originations
+    @staticmethod
+    def _check_origin_with_keywords(origin: str, lower_text: str) -> bool:
+        """
+        Check if origin is indicated with known keywords.
+        :param str origin: The origin to check for
+        :param str lower_text: The text to search in
+        :return: True if origin is found with keywords, False otherwise
+        :rtype: bool
+        """
+        for keyword in POSITIVE_KEYWORDS:
+            # Check with space between checkbox and origin
+            valid_option_with_space = f"{keyword} {origin}".lower()
+            # Check without space between checkbox and origin
+            valid_option_without_space = f"{keyword}{origin}".lower()
+            if valid_option_with_space in lower_text or valid_option_without_space in lower_text:
+                return True
+        return False
+    @staticmethod
+    def _check_origin_with_checkbox_chars(origin: str, lower_text: str, checkbox_chars: List[str]) -> bool:
+        """
+        Check if origin is indicated with checkbox characters.
+        :param str origin: The origin to check for
+        :param str lower_text: The text to search in
+        :param List[str] checkbox_chars: List of checkbox characters to check for
+        :return: True if origin is found with checkbox characters, False otherwise
+        :rtype: bool
+        """
+        for char in checkbox_chars:
+            # Check with and without space
+            if f"{char} {origin}".lower() in lower_text or f"{char}{origin}".lower() in lower_text:
+                return True
+        return False
+    @staticmethod
+    def _check_origin_with_text_patterns(origin: str, lower_text: str) -> bool:
+        """
+        Check if origin is indicated with text patterns.
+        :param str origin: The origin to check for
+        :param str lower_text: The text to search in
+        :return: True if origin is found with text patterns, False otherwise
+        :rtype: bool
+        """
+        # Look for patterns like "X is checked" or "X is selected"
+        check_patterns = [
+            f"{origin.lower()} is checked",
+            f"{origin.lower()} is selected",
+            f"{origin.lower()} (checked)",
+            f"{origin.lower()} (selected)",
+            f"selected: {origin.lower()}",
+        ]
+        return any(pattern in lower_text for pattern in check_patterns)
     @staticmethod
     def determine_status(text: str) -> str:
+        """
+        Determine the implementation status from the text.
+        :param str text: The text to analyze for implementation status
+        :return: The determined implementation status
+        :rtype: str
+        """
         # Tokenize the input text
         tokens = text.split()
@@ -150,23 +285,54 @@ class AppendixAParser:
         matches = []
+        # Common checkbox characters in various fonts and styles
+        checkbox_chars = ["☒", "☑", "☑️", "✓", "✔", "✔️", "✅", "⬜", "▣", "■", "□", "⊠", "⊗", "×"]
         # Search for keywords in the tokenized text to determine the status
         for status, keywords in STATUS_KEYWORDS.items():
             for keyword in keywords:
-                if f"1 {keyword}" in token_string or f"☒ {keyword}" in token_string:
+                # Check patterns with space: "1 keyword" or "☒ keyword" or any other checkbox char
+                if f"1 {keyword}" in token_string or any(
+                    f"{char} {keyword}" in token_string for char in checkbox_chars
+                ):
                     matches.append(status)
+                    break
+                # Check patterns without space: "1keyword" or "☒keyword" or any other checkbox char
+                elif f"1{keyword}" in token_string or any(
+                    f"{char}{keyword}" in token_string for char in checkbox_chars
+                ):
+                    matches.append(status)
+                    break
+                # Also check for direct True/Yes values next to keywords
+                elif any(pos + keyword in token_string for pos in ["true", "yes"]):
+                    matches.append(status)
+                    break
         # Determine the status to return
         if len(matches) > 1:
             # More than one match found
-            # not applicable takes presendence over planned/partially implemented (only 2 valid multi select statuses for fedramp)
-            if matches[1] == NA_STATUS:
-                return matches[1]
+            # Not applicable takes precedence over planned/partially implemented (only 2 valid multi select statuses for fedramp)
+            if NA_STATUS in matches:
+                return NA_STATUS
             else:
                 return matches[0]
         elif matches:
             return matches[0]  # Return the first match if only one
         else:
+            # Extra fallback for unusual checkbox patterns
+            # Look for any checkbox-like character anywhere in the text without keywords
+            for status, keywords in STATUS_KEYWORDS.items():
+                for keyword in keywords:
+                    # Skip the checkbox characters themselves (already checked above)
+                    if keyword in checkbox_chars:
+                        continue
+                    # Check if any checkbox character is present in the text alongside common implementation terms
+                    if any(char in token_string for char in checkbox_chars) and keyword in token_string:
+                        return status
             return DEFAULT_STATUS  # No matches found
     @staticmethod
@@ -192,28 +358,100 @@ class AppendixAParser:
         :return: The state of the checkbox.
         :rtype: bool
         """
-        # First, try getting the attribute 'val' directly
+        # Try different methods to determine checkbox state
+        methods = [
+            AppendixAParser._check_direct_val_attribute,
+            AppendixAParser._check_checked_element,
+            AppendixAParser._check_default_element,
+            AppendixAParser._check_child_elements,
+            AppendixAParser._check_attributes,
+            AppendixAParser._check_namespace_attributes,
+        ]
+        for method in methods:
+            result = method(checkbox_element)
+            if result is not None:
+                return result
+        # If none of the methods worked, return False
+        return False
+    @staticmethod
+    def _check_direct_val_attribute(element: Any) -> Optional[bool]:
+        """Check if element has a direct 'val' attribute."""
         val = "{%s}%s" % (SCHEMA, "val")
-        checked = "{%s}%s" % (SCHEMA, "checked")
-        default = "{%s}%s" % (SCHEMA, "default")
-        state = checkbox_element.get(val)
+        state = element.get(val)
         if state is not None:
             return state == "1"
+        return None
-        # If not found, look for a child element 'checked' that may contain the 'val' attribute
-        checked_element = checkbox_element.find(checked)
-        if checked_element is not None:
-            state = checked_element.get(val)
-            return state == "1"
+    @staticmethod
+    def _check_checked_element(element: Any) -> Optional[bool]:
+        """Check if element has a 'checked' child with a 'val' attribute."""
+        val = "{%s}%s" % (SCHEMA, "val")
+        checked = "{%s}%s" % (SCHEMA, "checked")
+        return AppendixAParser._check_element_with_val(element, checked, val)
-        # If still not found, check for a 'default' state as a fallback
-        default_element = checkbox_element.find(default)
-        if default_element is not None:
-            state = default_element.get(val)
-            return state == "1"
+    @staticmethod
+    def _check_default_element(element: Any) -> Optional[bool]:
+        """Check if element has a 'default' child with a 'val' attribute."""
+        val = "{%s}%s" % (SCHEMA, "val")
+        default = "{%s}%s" % (SCHEMA, "default")
+        return AppendixAParser._check_element_with_val(element, default, val)
-        # If there's no indication of the state, return False or handle accordingly
-        return False
+    @staticmethod
+    def _check_element_with_val(parent: Any, child_tag: str, val_tag: str) -> Optional[bool]:
+        """
+        Check if a child element has a 'val' attribute.
+        :param Any parent: The parent element
+        :param str child_tag: The child element tag
+        :param str val_tag: The value attribute tag
+        :return: True if val is "1", False if val is not "1", None if element or val not found
+        :rtype: Optional[bool]
+        """
+        child_element = parent.find(child_tag)
+        if child_element is not None:
+            state = child_element.get(val_tag)
+            if state is not None:
+                return state == "1"
+        return None
+    @staticmethod
+    def _check_child_elements(element: Any) -> Optional[bool]:
+        """Check all child elements for a 'val' attribute."""
+        val = "{%s}%s" % (SCHEMA, "val")
+        try:
+            for child in element.getchildren():
+                if child.get(val) is not None:
+                    return child.get(val) == "1"
+        except (AttributeError, TypeError):
+            pass
+        return None
+    @staticmethod
+    def _check_attributes(element: Any) -> Optional[bool]:
+        """Check all attributes for check-related names."""
+        try:
+            for attr_name, attr_value in element.attrib.items():
+                if "checked" in attr_name.lower() or "val" in attr_name.lower() or "state" in attr_name.lower():
+                    return attr_value in ["1", "true", "checked", "on"]
+        except (AttributeError, TypeError):
+            pass
+        return None
+    @staticmethod
+    def _check_namespace_attributes(element: Any) -> Optional[bool]:
+        """Check attributes in all namespaces."""
+        try:
+            for ns, uri in element.nsmap.items():
+                for attr_name in ["val", "checked", "state", "default"]:
+                    attr_with_ns = "{%s}%s" % (uri, attr_name)
+                    if element.get(attr_with_ns) is not None:
+                        return element.get(attr_with_ns) in ["1", "true", "checked", "on"]
+        except (AttributeError, TypeError):
+            pass
+        return None
     def get_implementation_statuses(self) -> Dict:
         """
@@ -268,25 +506,58 @@ class AppendixAParser:
         :param Dict control_dict: The dictionary containing the control implementation data.
         :param str check: The check string to exclude from the part value.
         """
+        part_list = control_dict.get("parts", [])
         if cell_count > 1:
-            name = self.get_cell_text(cells[0]) if cells[0].text else DEFAULT_PART
-            value = self.get_cell_text(cells[1])
-            part_list = control_dict.get("parts", [])
-            val_dict = {"name": name, "value": value}
-            if check not in value.lower() and val_dict not in part_list:
-                part_list.append(val_dict)
-            control_dict["parts"] = part_list
+            self._handle_multicolumn_part(cells, part_list, check)
         else:
-            value = self.get_cell_text(cells[0])
-            value_lower = value.lower()
-            pattern = re.compile(r"\b(" + "|".join(re.escape(part) for part in self.parts_set) + r")\b", re.IGNORECASE)
-            match = pattern.search(value_lower)
-            name = match.group(1) if match else DEFAULT_PART
-            part_list = control_dict.get("parts", [])
-            val_dict = {"name": name, "value": value}
-            if check.lower() not in value_lower and val_dict not in part_list:
-                part_list.append(val_dict)
-            control_dict["parts"] = part_list
+            self._handle_single_column_part(cells[0], part_list, check)
+        control_dict["parts"] = part_list
+    def _handle_multicolumn_part(self, cells: Any, part_list: List, check: str):
+        """
+        Handle a part with multiple columns.
+        :param Any cells: The cells in the row.
+        :param List part_list: List to add parts to.
+        :param str check: The check string to exclude from part value.
+        """
+        name = self.get_cell_text(cells[0]) if cells[0].text else DEFAULT_PART
+        value = self.get_cell_text(cells[1])
+        val_dict = {"name": name, "value": value}
+        if check not in value.lower() and val_dict not in part_list:
+            part_list.append(val_dict)
+    def _handle_single_column_part(self, cell: Any, part_list: List, check: str):
+        """
+        Handle a part with a single column.
+        :param Any cell: The cell to process.
+        :param List part_list: List to add parts to.
+        :param str check: The check string to exclude from part value.
+        """
+        value = self.get_cell_text(cell)
+        value_lower = value.lower()
+        # Find part name using regex pattern
+        name = self._extract_part_name(value_lower)
+        val_dict = {"name": name, "value": value}
+        if check.lower() not in value_lower and val_dict not in part_list:
+            part_list.append(val_dict)
+    def _extract_part_name(self, text: str) -> str:
+        """
+        Extract part name from text using regex.
+        :param str text: The text to extract from.
+        :return: The extracted part name or default part name.
+        :rtype: str
+        """
+        pattern = re.compile(r"\b(" + "|".join(re.escape(part) for part in self.parts_set) + r")\b", re.IGNORECASE)
+        match = pattern.search(text)
+        return match.group(1) if match else DEFAULT_PART
     def set_cell_text(self, cell: Any):
         """
@@ -330,6 +601,8 @@ class AppendixAParser:
         self._handle_implementation_status()
         self._handle_implementation_origination()
         self._handle_implementation_statement()
+        # Comment out the implementation parts handling as it requires parameters not available in this context
+        # We'll rely on the handle_row_parts method to handle parts instead
         # self._handle_implementation_parts(cell_index, cells)
         self._handle_responsibility()
@@ -363,23 +636,44 @@ class AppendixAParser:
         """
         Handle the origination of the control implementation.
         """
+        origination_values = []
+        # Check if we're in a Control Summary section and have Control Origination text
         if (
-            self.cell_data_status
-            and any(
-                [self.score_similarity(self.cell_data_status.lower(), origin) > 90 for origin in LOWER_ORIGINATIONS]
-            )
-            and CONTROL_SUMMARY_KEY.lower() in self.header_row_text.lower()
+            CONTROL_SUMMARY_KEY.lower() in self.header_row_text.lower()
             and CONTROL_ORIGIN_KEY.lower() in self.joined_processed_texts.lower()
-            and self.header_row_text.split(" ")[0] in self.controls_implementations
+            and self.control_id in self.controls_implementations
+            and self.controls_implementations[self.control_id] is not None
         ):
-            if self.control_id in self.controls_implementations:
+            # Method 1: Check cell_data for origination values based on checkbox states
+            for key, value in self.cell_data.items():
+                if value and any(origin.lower() in key.lower() for origin in ORIGINATIONS):
+                    # Find the matching origination from the known list
+                    for origin in ORIGINATIONS:
+                        if origin.lower() in key.lower():
+                            logger.debug(f"Found origination from checkbox: {origin}")
+                            if origin not in origination_values:
+                                origination_values.append(origin)
+                            break
+            # Method 2: Try determine_origination as backup
+            if orig := self.determine_origination(self.joined_processed_texts):
+                logger.debug(f"Found origination from text: {orig}")
+                # Handle multiple comma-separated values in the determine_origination result
+                for origin in orig.split(","):
+                    if origin.strip() and origin.strip() not in origination_values:
+                        origination_values.append(origin.strip())
+            # Save all origination values as comma-delimited string
+            if origination_values:
                 control_dict = self.controls_implementations[self.control_id]
-                control_dict["origination"] = self.cell_data_status
-        elif origination := self.determine_origination(self.joined_processed_texts):
-            if origination in ORIGINATIONS:
-                if self.control_id in self.controls_implementations:
-                    control_dict = self.controls_implementations[self.control_id]
-                    control_dict["origination"] = origination
+                control_dict["origination"] = ",".join(origination_values)
+                logger.debug(f"Setting origination for {self.control_id}: {control_dict['origination']}")
+            elif DEFAULT_ORIGINATION:
+                # Set default if none found
+                control_dict = self.controls_implementations[self.control_id]
+                control_dict["origination"] = DEFAULT_ORIGINATION
+                logger.debug(f"Setting default origination for {self.control_id}: {DEFAULT_ORIGINATION}")
     def _handle_implementation_status(self):
         """
@@ -439,38 +733,80 @@ class AppendixAParser:
         """
         value_check = f"{self.control_id} What is the solution and how is it implemented?"
         generic_value_check = "What is the solution and how is it implemented".lower()
-        if (
+        # Skip processing if conditions aren't met
+        if not self._should_process_parts(value_check, generic_value_check):
+            return
+        part_value = self.joined_processed_texts.strip()
+        control_dict = self.controls_implementations.get(self.control_id, {})
+        part_list = control_dict.get("parts", [])
+        # Check if this is a part declaration
+        if not self._is_part_declaration(part_value):
+            return
+        part_name = part_value.strip() or DEFAULT_PART
+        part_value = self._combine_part_text(part_name, part_value, cell_index, cells)
+        # Build the part dictionary
+        self.build_part_dict(
+            part_name=part_name,
+            part_value=part_value,
+            control_dict=control_dict,
+            part_list=part_list,
+            generic_value_check=generic_value_check,
+        )
+    def _should_process_parts(self, value_check: str, generic_value_check: str) -> bool:
+        """
+        Determine if parts processing should continue.
+        :param str value_check: Value check string for this specific control
+        :param str generic_value_check: Generic value check string
+        :return: True if processing should continue, False otherwise
+        :rtype: bool
+        """
+        return (
             generic_value_check in self.header_row_text.lower()
             and value_check.lower() != self.joined_processed_texts.lower()
             and self.control_id in self.controls_implementations
-        ):
-            part_value = self.joined_processed_texts.strip()
-            control_dict = self.controls_implementations.get(self.control_id, {})
-            part_list = control_dict.get("parts", [])
-            if any(
-                [
-                    part_value.strip().lower() == p.lower() or part_value.strip().lower() == f"{p.lower()}:"
-                    for p in self.parts
-                ]
-            ):
-                part_name = part_value.strip() or DEFAULT_PART
-                next_cell_text = self.get_cell_text(cells[cell_index + 1])
-                if ":" not in part_value:
-                    part_value = ": ".join(
-                        [
-                            part_value.strip(),
-                            next_cell_text.strip(),
-                        ]
-                    )
-                else:
-                    part_value = " ".join([part_value.strip(), next_cell_text.strip()])
-                self.build_part_dict(
-                    part_name=part_name,
-                    part_value=part_value,
-                    control_dict=control_dict,
-                    part_list=part_list,
-                    generic_value_check=generic_value_check,
-                )
+        )
+    def _is_part_declaration(self, part_value: str) -> bool:
+        """
+        Check if the value is a part declaration.
+        :param str part_value: The value to check
+        :return: True if it's a part declaration, False otherwise
+        :rtype: bool
+        """
+        return any(
+            [
+                part_value.strip().lower() == p.lower() or part_value.strip().lower() == f"{p.lower()}:"
+                for p in self.parts
+            ]
+        )
+    def _combine_part_text(self, part_name: str, part_value: str, cell_index: int, cells: Any) -> str:
+        """
+        Combine part text from potentially multiple cells.
+        :param str part_name: Name of the part
+        :param str part_value: Current value text
+        :param int cell_index: Current cell index
+        :param Any cells: All cells in the row
+        :return: Combined part text
+        :rtype: str
+        """
+        next_cell_text = self.get_cell_text(cells[cell_index + 1])
+        if ":" not in part_value:
+            # If part_value doesn't have a colon, add the next cell's text after a colon
+            return ": ".join([part_value.strip(), next_cell_text.strip()])
+        else:
+            # If part_value already has a colon, just add the next cell's text
+            return " ".join([part_value.strip(), next_cell_text.strip()])
     def build_part_dict(
         self, part_name: str, part_value: str, control_dict: Dict, part_list: List, generic_value_check: str
@@ -484,24 +820,42 @@ class AppendixAParser:
         :param str generic_value_check: The generic value check string.
         """
         if part_value.lower().startswith("part"):
-            parts = part_value.split(":", 1)
-            part_dict = {"name": part_name, "value": DEFAULT_PART}
-            if len(parts) == 2 and parts[1].strip() != "":
-                part_dict["name"] = parts[0].strip()
-                part_dict["value"] = parts[1].strip()
-                logger.debug(f"Part: {part_dict}")
-                self.add_to_list(new_dict=part_dict, the_list=part_list)
-            elif part_value.strip() != "" and generic_value_check not in part_value.lower():
-                part_dict["value"] = part_value.strip()
-                self.add_to_list(new_dict=part_dict, the_list=part_list)
+            self._handle_part_value_starting_with_part(part_name, part_value, part_list, generic_value_check)
         elif generic_value_check not in part_value.lower():
+            # For values that don't start with "part" but are valid
             pdict = {
                 "name": DEFAULT_PART,
                 "value": part_value.strip(),
             }
             self.add_to_list(new_dict=pdict, the_list=part_list)
         control_dict["parts"] = part_list
+    def _handle_part_value_starting_with_part(
+        self, part_name: str, part_value: str, part_list: List, generic_value_check: str
+    ):
+        """
+        Handle part values that start with "part".
+        :param str part_name: The name of the part
+        :param str part_value: The value of the part
+        :param List part_list: The list to add parts to
+        :param str generic_value_check: The generic value check string
+        """
+        parts = part_value.split(":", 1)
+        part_dict = {"name": part_name, "value": DEFAULT_PART}
+        if len(parts) == 2 and parts[1].strip() != "":
+            # If part value has a colon and content after it
+            part_dict["name"] = parts[0].strip()
+            part_dict["value"] = parts[1].strip()
+            logger.debug(f"Part: {part_dict}")
+            self.add_to_list(new_dict=part_dict, the_list=part_list)
+        elif part_value.strip() != "" and generic_value_check not in part_value.lower():
+            # If part value has no colon but is not empty and not the generic check
+            part_dict["value"] = part_value.strip()
+            self.add_to_list(new_dict=part_dict, the_list=part_list)
     @staticmethod
     def add_to_list(new_dict: Dict, the_list: List):
         """
@@ -530,14 +884,55 @@ class AppendixAParser:
         """
         Handle the checkbox states in the DOCX table.
         """
-        updated_checkbox_states = [self._get_checkbox_state(state) for state in self.checkbox_states]
-        for item in self.processed_texts[1:]:
-            if isinstance(item, dict):
-                self.cell_data.update(item)
-            else:
-                self.cell_data[item.strip()] = updated_checkbox_states.pop(0) if updated_checkbox_states else None
+        try:
+            # Get checkbox states
+            updated_checkbox_states = []
+            for checkbox in self.checkbox_states:
+                try:
+                    is_checked = self._get_checkbox_state(checkbox)
+                    updated_checkbox_states.append(is_checked)
+                    logger.debug(f"Checkbox state: {is_checked}")
+                except Exception as e:
+                    # If we can't determine the state, assume it's not checked
+                    logger.debug(f"Error getting checkbox state: {e}")
+                    updated_checkbox_states.append(False)
+            # Log total checkboxes found
+            logger.debug(f"Found {len(updated_checkbox_states)} checkbox states: {updated_checkbox_states}")
+            # First handle any dictionary items in processed_texts
+            for item in self.processed_texts:
+                if isinstance(item, dict):
+                    self.cell_data.update(item)
+            # Handle text items with corresponding checkbox states
+            text_items = [item for item in self.processed_texts if not isinstance(item, dict)]
+            # Match checkbox states to text items
+            for i, item in enumerate(text_items):
+                if i < len(updated_checkbox_states):
+                    self.cell_data[item.strip()] = updated_checkbox_states[i]
+                else:
+                    # If we have more text items than checkbox states, assume unchecked
+                    self.cell_data[item.strip()] = False
+            # Also check for checkbox character directly in text
+            for key in list(self.cell_data.keys()):
+                # If text contains a checkbox character and state is False, try to determine true state from text
+                if not self.cell_data[key]:
+                    checkbox_chars = ["☒", "☑", "☑️", "✓", "✔", "✔️", "✅", "⬜", "▣", "■", "□", "⊠", "⊗", "×"]
+                    if any(char in key for char in checkbox_chars):
+                        self.cell_data[key] = True
+            # Update cell data status
             self._get_cell_data_status()
+        except Exception as e:
+            logger.debug(f"Error in _handle_checkbox_states: {e}")
+            # Ensure we don't leave checkbox_states empty
+            if not hasattr(self, "cell_data") or self.cell_data is None:
+                self.cell_data = {}
     def _get_cell_data_status(self):
         """
         Get the status of the cell data.