txt2stix 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2stix/__init__.py +33 -0
- txt2stix/ai_extractor/__init__.py +15 -0
- txt2stix/ai_extractor/anthropic.py +12 -0
- txt2stix/ai_extractor/base.py +87 -0
- txt2stix/ai_extractor/deepseek.py +19 -0
- txt2stix/ai_extractor/gemini.py +18 -0
- txt2stix/ai_extractor/openai.py +15 -0
- txt2stix/ai_extractor/openrouter.py +20 -0
- txt2stix/ai_extractor/prompts.py +164 -0
- txt2stix/ai_extractor/utils.py +85 -0
- txt2stix/attack_flow.py +101 -0
- txt2stix/bundler.py +428 -0
- txt2stix/common.py +23 -0
- txt2stix/extractions.py +59 -0
- txt2stix/includes/__init__.py +0 -0
- txt2stix/includes/extractions/ai/config.yaml +1023 -0
- txt2stix/includes/extractions/lookup/config.yaml +393 -0
- txt2stix/includes/extractions/pattern/config.yaml +609 -0
- txt2stix/includes/helpers/mimetype_filename_extension_list.csv +936 -0
- txt2stix/includes/helpers/stix_relationship_types.txt +41 -0
- txt2stix/includes/helpers/tlds.txt +1446 -0
- txt2stix/includes/helpers/windows_registry_key_prefix.txt +12 -0
- txt2stix/includes/lookups/_README.md +11 -0
- txt2stix/includes/lookups/_generate_lookups.py +247 -0
- txt2stix/includes/lookups/attack_pattern.txt +1 -0
- txt2stix/includes/lookups/campaign.txt +1 -0
- txt2stix/includes/lookups/country_iso3166_alpha2.txt +249 -0
- txt2stix/includes/lookups/course_of_action.txt +1 -0
- txt2stix/includes/lookups/disarm_id_v1_5.txt +345 -0
- txt2stix/includes/lookups/disarm_name_v1_5.txt +347 -0
- txt2stix/includes/lookups/extensions.txt +78 -0
- txt2stix/includes/lookups/identity.txt +1 -0
- txt2stix/includes/lookups/infrastructure.txt +1 -0
- txt2stix/includes/lookups/intrusion_set.txt +1 -0
- txt2stix/includes/lookups/malware.txt +2 -0
- txt2stix/includes/lookups/mitre_atlas_id_v4_5_2.txt +116 -0
- txt2stix/includes/lookups/mitre_atlas_name_v4_5_2.txt +117 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_aliases_v16_0.txt +1502 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_id_v16_0.txt +1656 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_name_v16_0.txt +1765 -0
- txt2stix/includes/lookups/mitre_attack_ics_aliases_v16_0.txt +141 -0
- txt2stix/includes/lookups/mitre_attack_ics_id_v16_0.txt +254 -0
- txt2stix/includes/lookups/mitre_attack_ics_name_v16_0.txt +293 -0
- txt2stix/includes/lookups/mitre_attack_mobile_aliases_v16_0.txt +159 -0
- txt2stix/includes/lookups/mitre_attack_mobile_id_v16_0.txt +277 -0
- txt2stix/includes/lookups/mitre_attack_mobile_name_v16_0.txt +296 -0
- txt2stix/includes/lookups/mitre_capec_id_v3_9.txt +559 -0
- txt2stix/includes/lookups/mitre_capec_name_v3_9.txt +560 -0
- txt2stix/includes/lookups/mitre_cwe_id_v4_15.txt +939 -0
- txt2stix/includes/lookups/mitre_cwe_name_v4_15.txt +939 -0
- txt2stix/includes/lookups/threat_actor.txt +1 -0
- txt2stix/includes/lookups/tld.txt +1422 -0
- txt2stix/includes/lookups/tool.txt +1 -0
- txt2stix/includes/tests/test_cases.yaml +695 -0
- txt2stix/indicator.py +860 -0
- txt2stix/lookups.py +68 -0
- txt2stix/pattern/__init__.py +13 -0
- txt2stix/pattern/extractors/__init__.py +0 -0
- txt2stix/pattern/extractors/base_extractor.py +167 -0
- txt2stix/pattern/extractors/card/README.md +34 -0
- txt2stix/pattern/extractors/card/__init__.py +15 -0
- txt2stix/pattern/extractors/card/amex_card_extractor.py +52 -0
- txt2stix/pattern/extractors/card/diners_card_extractor.py +47 -0
- txt2stix/pattern/extractors/card/discover_card_extractor.py +48 -0
- txt2stix/pattern/extractors/card/jcb_card_extractor.py +43 -0
- txt2stix/pattern/extractors/card/master_card_extractor.py +63 -0
- txt2stix/pattern/extractors/card/union_card_extractor.py +38 -0
- txt2stix/pattern/extractors/card/visa_card_extractor.py +46 -0
- txt2stix/pattern/extractors/crypto/__init__.py +3 -0
- txt2stix/pattern/extractors/crypto/btc_extractor.py +38 -0
- txt2stix/pattern/extractors/directory/__init__.py +10 -0
- txt2stix/pattern/extractors/directory/unix_directory_extractor.py +40 -0
- txt2stix/pattern/extractors/directory/unix_file_path_extractor.py +42 -0
- txt2stix/pattern/extractors/directory/windows_directory_path_extractor.py +47 -0
- txt2stix/pattern/extractors/directory/windows_file_path_extractor.py +42 -0
- txt2stix/pattern/extractors/domain/__init__.py +8 -0
- txt2stix/pattern/extractors/domain/domain_extractor.py +39 -0
- txt2stix/pattern/extractors/domain/hostname_extractor.py +36 -0
- txt2stix/pattern/extractors/domain/sub_domain_extractor.py +49 -0
- txt2stix/pattern/extractors/hashes/__init__.py +16 -0
- txt2stix/pattern/extractors/hashes/md5_extractor.py +16 -0
- txt2stix/pattern/extractors/hashes/sha1_extractor.py +14 -0
- txt2stix/pattern/extractors/hashes/sha224_extractor.py +18 -0
- txt2stix/pattern/extractors/hashes/sha2_256_exactor.py +14 -0
- txt2stix/pattern/extractors/hashes/sha2_512_exactor.py +13 -0
- txt2stix/pattern/extractors/hashes/sha3_256_exactor.py +15 -0
- txt2stix/pattern/extractors/hashes/sha3_512_exactor.py +16 -0
- txt2stix/pattern/extractors/helper.py +64 -0
- txt2stix/pattern/extractors/ip/__init__.py +14 -0
- txt2stix/pattern/extractors/ip/ipv4_cidr_extractor.py +49 -0
- txt2stix/pattern/extractors/ip/ipv4_extractor.py +18 -0
- txt2stix/pattern/extractors/ip/ipv4_port_extractor.py +42 -0
- txt2stix/pattern/extractors/ip/ipv6_cidr_extractor.py +18 -0
- txt2stix/pattern/extractors/ip/ipv6_extractor.py +16 -0
- txt2stix/pattern/extractors/ip/ipv6_port_extractor.py +46 -0
- txt2stix/pattern/extractors/others/__init__.py +22 -0
- txt2stix/pattern/extractors/others/asn_extractor.py +14 -0
- txt2stix/pattern/extractors/others/cpe_extractor.py +29 -0
- txt2stix/pattern/extractors/others/cve_extractor.py +14 -0
- txt2stix/pattern/extractors/others/email_extractor.py +21 -0
- txt2stix/pattern/extractors/others/filename_extractor.py +17 -0
- txt2stix/pattern/extractors/others/iban_extractor.py +15 -0
- txt2stix/pattern/extractors/others/mac_address_extractor.py +13 -0
- txt2stix/pattern/extractors/others/phonenumber_extractor.py +41 -0
- txt2stix/pattern/extractors/others/user_agent_extractor.py +20 -0
- txt2stix/pattern/extractors/others/windows_registry_key_extractor.py +18 -0
- txt2stix/pattern/extractors/url/__init__.py +7 -0
- txt2stix/pattern/extractors/url/url_extractor.py +22 -0
- txt2stix/pattern/extractors/url/url_file_extractor.py +21 -0
- txt2stix/pattern/extractors/url/url_path_extractor.py +74 -0
- txt2stix/retriever.py +126 -0
- txt2stix/stix.py +1 -0
- txt2stix/txt2stix.py +336 -0
- txt2stix/utils.py +86 -0
- txt2stix-0.0.4.dist-info/METADATA +190 -0
- txt2stix-0.0.4.dist-info/RECORD +119 -0
- txt2stix-0.0.4.dist-info/WHEEL +4 -0
- txt2stix-0.0.4.dist-info/entry_points.txt +2 -0
- txt2stix-0.0.4.dist-info/licenses/LICENSE +202 -0
txt2stix/__init__.py
ADDED
@@ -0,0 +1,33 @@
from txt2stix import extractions
from .bundler import txt2stixBundler
from .txt2stix import extract_all
from pathlib import Path

INCLUDES_PATH = None
def get_include_path():
    global INCLUDES_PATH

    if INCLUDES_PATH:
        return INCLUDES_PATH

    from pathlib import Path
    MODULE_PATH = Path(__file__).parent.parent
    INCLUDES_PATH = MODULE_PATH/"includes"
    try:
        from . import includes
        INCLUDES_PATH = Path(includes.__file__).parent
    except:
        pass
    return INCLUDES_PATH

def set_include_path(path):
    global INCLUDES_PATH
    INCLUDES_PATH = path

def get_all_extractors(include_path=None):
    return extractions.parse_extraction_config(include_path or get_include_path())


__all__ = [
    'txt2stixBundler', 'extract_all', 'get_include_path'
]
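A minimal usage sketch of the include-path API above (the directory path is hypothetical, not part of the package):

import txt2stix

# Uses the bundled `includes` package by default.
extractors = txt2stix.get_all_extractors()

# Point the library at a custom includes directory instead (hypothetical path).
txt2stix.set_include_path("/opt/custom/includes")
extractors = txt2stix.get_all_extractors()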
txt2stix/ai_extractor/__init__.py
ADDED
@@ -0,0 +1,15 @@
import logging

import dotenv

from .base import _ai_extractor_registry as ALL_AI_EXTRACTORS

from .base import BaseAIExtractor
class ModelError(Exception):
    pass

for path in ["openai", "anthropic", "gemini", "deepseek", "openrouter"]:
    try:
        __import__(__package__ + "." + path)
    except Exception as e:
        logging.warning("%s not supported, please install missing modules", path, exc_info=True)
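A sketch of selecting a provider from the registry exported above, assuming the matching llama-index integration is installed; the model name is illustrative, not a package default:

from txt2stix.ai_extractor import ALL_AI_EXTRACTORS

# Providers that imported cleanly above are available by name.
extractor_cls = ALL_AI_EXTRACTORS["openai"]
extractor = extractor_cls(model="gpt-4o")  # kwargs are forwarded to the underlying LLM class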
txt2stix/ai_extractor/anthropic.py
ADDED
@@ -0,0 +1,12 @@

import logging
import os
from txt2stix.ai_extractor.base import BaseAIExtractor
from llama_index.llms.anthropic import Anthropic


class AnthropicAIExtractor(BaseAIExtractor, provider="anthropic"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
        self.llm = Anthropic(max_tokens=4096, system_prompt=self.system_prompt, **kwargs)
        super().__init__()
txt2stix/ai_extractor/base.py
ADDED
@@ -0,0 +1,87 @@
import logging
from typing import Type
from llama_index.core.program import LLMTextCompletionProgram

import textwrap
from llama_index.core import PromptTemplate
from llama_index.core.llms.llm import LLM

from txt2stix.ai_extractor.prompts import DEFAULT_CONTENT_CHECKER_TEMPL, DEFAULT_EXTRACTION_TEMPL, DEFAULT_RELATIONSHIP_TEMPL, DEFAULT_SYSTEM_PROMPT, ATTACK_FLOW_PROMPT_TEMPL
from txt2stix.ai_extractor.utils import AttackFlowList, DescribesIncident, ExtractionList, ParserWithLogging, RelationshipList, get_extractors_str
from llama_index.core.utils import get_tokenizer


_ai_extractor_registry: dict[str, 'Type[BaseAIExtractor]'] = {}
class BaseAIExtractor():
    system_prompt = DEFAULT_SYSTEM_PROMPT

    extraction_template = DEFAULT_EXTRACTION_TEMPL

    relationship_template = DEFAULT_RELATIONSHIP_TEMPL

    content_check_template = DEFAULT_CONTENT_CHECKER_TEMPL

    def _get_extraction_program(self):
        return LLMTextCompletionProgram.from_defaults(
            output_parser=ParserWithLogging(ExtractionList),
            prompt=self.extraction_template,
            verbose=True,
            llm=self.llm,
        )

    def _get_relationship_program(self):
        return LLMTextCompletionProgram.from_defaults(
            output_parser=ParserWithLogging(RelationshipList),
            prompt=self.relationship_template,
            verbose=True,
            llm=self.llm,
        )

    def _get_content_checker_program(self):
        return LLMTextCompletionProgram.from_defaults(
            output_parser=ParserWithLogging(DescribesIncident),
            prompt=self.content_check_template,
            verbose=True,
            llm=self.llm,
        )

    def check_content(self, text) -> DescribesIncident:
        return self._get_content_checker_program()(context_str=text)

    def _get_attack_flow_program(self):
        return LLMTextCompletionProgram.from_defaults(
            output_parser=ParserWithLogging(AttackFlowList),
            prompt=ATTACK_FLOW_PROMPT_TEMPL,
            verbose=True,
            llm=self.llm,
        )

    def extract_attack_flow(self, input_text, extractions, relationships) -> AttackFlowList:
        return self._get_attack_flow_program()(document=input_text, extractions=extractions, relationships=relationships)

    def extract_relationships(self, input_text, extractions, relationship_types: list[str]) -> RelationshipList:
        return self._get_relationship_program()(relationship_types=relationship_types, input_file=input_text, extractions=extractions)

    def extract_objects(self, input_text, extractors) -> ExtractionList:
        return self._get_extraction_program()(extractors=get_extractors_str(extractors), input_file=input_text)

    def __init__(self, *args, **kwargs) -> None:
        pass

    def count_tokens(self, input_text):
        logging.info("unsupported model `%s`, estimating using llama-index's default tokenizer", self.extractor_name)
        return len(get_tokenizer()(input_text))

    def __init_subclass__(cls, /, provider, register=True, **kwargs):
        super().__init_subclass__(**kwargs)
        if register:
            cls.provider = provider
            _ai_extractor_registry[provider] = cls

    @property
    def extractor_name(self):
        return f"{self.provider}:{self.llm.model}"


    def __hash__(self):
        return hash(self.extractor_name)
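A sketch of how `__init_subclass__` drives the provider registry; `EchoExtractor` is hypothetical and not part of the package:

from txt2stix.ai_extractor.base import BaseAIExtractor, _ai_extractor_registry

class EchoExtractor(BaseAIExtractor, provider="echo"):
    def __init__(self) -> None:
        self.llm = None  # a real subclass assigns a llama-index LLM here
        super().__init__()

# Declaring the class is enough; no explicit registration call is needed.
assert _ai_extractor_registry["echo"] is EchoExtractor
assert EchoExtractor.provider == "echo"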
txt2stix/ai_extractor/deepseek.py
ADDED
@@ -0,0 +1,19 @@
import logging
import os

from .base import BaseAIExtractor
from llama_index.llms.deepseek import DeepSeek

class DeepseekExtractor(BaseAIExtractor, provider='deepseek'):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
        kwargs.setdefault('model', 'deepseek-chat')
        self.llm = DeepSeek(system_prompt=self.system_prompt, **kwargs)
        super().__init__()

    def count_tokens(self, text):
        try:
            return len(self.llm._tokenizer.encode(text))
        except Exception as e:
            logging.warning(e)
            return super().count_tokens(text)
txt2stix/ai_extractor/gemini.py
ADDED
@@ -0,0 +1,18 @@

import os
from txt2stix.ai_extractor.base import BaseAIExtractor
from llama_index.llms.gemini import Gemini


class GeminiAIExtractor(BaseAIExtractor, provider="gemini"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
        self.llm = Gemini(max_tokens=4096, **kwargs)
        super().__init__()

    def count_tokens(self, text):
        return self.llm._model.count_tokens(text).total_tokens

    @property
    def extractor_name(self):
        return f"{self.provider}:{self.llm.model}"
txt2stix/ai_extractor/openai.py
ADDED
@@ -0,0 +1,15 @@

import os
from txt2stix.ai_extractor.base import BaseAIExtractor
from llama_index.llms.openai import OpenAI


class OpenAIExtractor(BaseAIExtractor, provider="openai"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
        self.llm = OpenAI(system_prompt=self.system_prompt, **kwargs)
        super().__init__()

    def count_tokens(self, text):
        return len(self.llm._tokenizer.encode(text))

txt2stix/ai_extractor/openrouter.py
ADDED
@@ -0,0 +1,20 @@

import logging
import os
from .base import BaseAIExtractor
from llama_index.llms.openrouter import OpenRouter


class OpenRouterExtractor(BaseAIExtractor, provider="openrouter"):
    def __init__(self, **kwargs) -> None:
        kwargs.setdefault('temperature', float(os.environ.get('TEMPERATURE', 0.0)))
        self.llm = OpenRouter(system_prompt=self.system_prompt, **kwargs)
        super().__init__()

    def count_tokens(self, text):
        try:
            return len(self.llm._tokenizer.encode(text))
        except Exception as e:
            logging.warning(e)
            return super().count_tokens(text)

txt2stix/ai_extractor/prompts.py
ADDED
@@ -0,0 +1,164 @@

from llama_index.core import PromptTemplate, ChatPromptTemplate
import textwrap
from llama_index.core.base.llms.types import ChatMessage, MessageRole


DEFAULT_SYSTEM_PROMPT = textwrap.dedent(
    """
    <persona>

    You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.

    You have a deep understanding of cybersecurity and threat intelligence concepts.

    IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.

    </persona>
    """
)

DEFAULT_EXTRACTION_TEMPL = PromptTemplate(textwrap.dedent(
    """
    <persona>
    You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.
    You have a deep understanding of cybersecurity and threat intelligence concepts.
    IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.
    </persona>
    <requirements>
    Using the report text printed between the `<document>` tags, you should extract the Indicators of Compromise (IoCs) and Tactics, Techniques, and Procedures (TTPs) being described in it.
    The document can contain the same IOC or TTP one or more times. Only create one record for each extraction -- the extractions must be unique!
    Only one JSON object should exist for each unique value.
    </requirements>
    <accuracy>
    Think about your answer first before you respond. The accuracy of your response is very important as this data will be used for operational purposes.
    If you don't know the answer, reply with success: false, do not ever try to make up an answer.
    </accuracy>
    <document>
    {input_file}
    </document>
    <extractors>
    {extractors}
    </extractors>
    <response>
    IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
    Response MUST be in JSON format.
    Response MUST start with: {"success":
    </response>
    """
))


DEFAULT_RELATIONSHIP_TEMPL = PromptTemplate(textwrap.dedent(
    """
    <persona>
    You are a cyber-security threat intelligence analysis tool responsible for analysing intelligence provided in text files.
    You have a deep understanding of cybersecurity and threat intelligence concepts.
    IMPORTANT: You must always deliver your work as a computer-parsable output in JSON format. All output from you will be parsed with pydantic for further processing.
    </persona>
    <requirements>
    The tag `<extractions>` contains all the observables and TTPs that were extracted from the document provided in `<document>`
    Please capture the relationships between the extractions and describe them using NLP techniques.
    A relationship MUST have different source_ref and target_ref
    Select an appropriate relationship_type from `<relationship_types>`.
    Only use `related-to` or any other vague `relationship_type` as a last resort.
    The value of relationship_type MUST be clear, and it SHOULD NOT describe everything as related-to each other unless they are related in context of the `<document>
    IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
    </requirements>
    <accuracy>
    Think about your answer first before you respond. The accuracy of your response is very important as this data will be used for operational purposes.
    If you don't know the answer, reply with success: false, do not ever try to make up an answer.
    </accuracy>
    <document>
    {input_file}
    </document>
    <extractions>
    {extractions}
    </extractions>
    <relationship_types>
    {relationship_types}
    </relationship_types>
    <response>
    IMPORTANT: Only include a valid JSON document in your response and no other text. The JSON document should be minified!.
    Response MUST be in JSON format.
    Response MUST start with: {"success":
    </response>
    """
))

DEFAULT_CONTENT_CHECKER_TEMPL = PromptTemplate("""
<persona>
You are a cyber security threat intelligence analyst.
Your job is to review reports that describe a cyber security incidents and/or threat intelligence.
Examples include malware analysis, APT group reports, data breaches, vulnerabilities, or Indicators of Compromise.
Some of the documents you are given will not be this type of report.
I need you to tell me if the text provided does match the type of report you are expecting.
</persona>
<requirement>
Using the MARKDOWN of the report provided in <document>
IMPORTANT: the output should be structured as valid JSON.
IMPORTANT: output should not be in markdown, it must be a plain JSON text without any code block
IMPORTANT: do not include any comment in the output
IMPORTANT: output must start with a `{` and end with a `}` and must not contain "```"
</requirement>
<document>
{context_str}
</document>
<incident_classification>
Possible Incident Classifications are
* `other` (the report does not fit into any of the following categories)
* `apt_group`
* `vulnerability`
* `data_leak`
* `malware`
* `ransomware`
* `infostealer`
* `threat_actor`
* `campaign`
* `exploit`
* `cyber_crime`
* `indicator_of_compromise`
* `ttp`
</incident_classification>
""")

ATTACK_FLOW_PROMPT_TEMPL = ChatPromptTemplate([
    ChatMessage.from_str("""You are a cyber security threat intelligence analyst.
Your job is to review report that describe a cyber security incidents.
Examples include malware analysis, APT group reports, data breaches and vulnerabilities.""", MessageRole.SYSTEM),
    ChatMessage.from_str("Hi, What <document> would you like me to process for you? the message below must contain the document and the document only", MessageRole.ASSISTANT),
    ChatMessage.from_str("{document}", MessageRole.USER),
    ChatMessage.from_str("What are the objects that have been extracted (<extractions>) from the document above?", MessageRole.ASSISTANT),
    ChatMessage.from_str("{extractions}", MessageRole.USER),
    ChatMessage.from_str("What are the relationships that have been extracted (<relationships>) between the documents?", MessageRole.USER),
    ChatMessage.from_str("{relationships}", MessageRole.USER),
    ChatMessage.from_str("What should I do with all the data that have been provided?", MessageRole.ASSISTANT),
    ChatMessage.from_str("""Consider all the MITRE ATT&CK Objects extracted from the report and the relationships they have to other objects.

Now I need you to logically define the order of ATT&CK Tactics/Techniques as they are executed in the incident described in the report.

It is possible that the Techniques extracted are not linked to the relevant MITRE ATT&CK Tactic. You should also assign the correct Tactic to a Technique where a Technique belongs to many ATT&CK Tactics in the ATT&CK Matrix if that can correctly be inferred.

You should also provide a short overview about how this technique is described in the report as the name, and a longer version in description.

IMPORTANT: only include the ATT&CK IDs extracted already, do not add any new extractions.

You should deliver a response in JSON as follows

[
    {
        "position": "<ORDER OF OBJECTS STARTING AT 0",
        "attack_tactic_id": "<ID>",
        "attack_technique_id": "<ID>",
        "name": "<NAME>",
        "description": "<DESC>"
    },
    {
        "position": "<ORDER OF OBJECTS STARTING AT 0",
        "attack_tactic_id": "<ID>",
        "attack_technique_id": "<ID>",
        "name": "<NAME>",
        "description": "<DESC>"
    }
]""", MessageRole.USER)
])
txt2stix/ai_extractor/utils.py
ADDED
@@ -0,0 +1,85 @@
import io
import json
import logging

import dotenv
import textwrap

from ..extractions import Extractor

from pydantic import BaseModel, Field, RootModel
from llama_index.core.output_parsers import PydanticOutputParser

class Extraction(BaseModel):
    type : str = Field(description="is the extraction_key value shown in the list printed earlier in this prompt")
    id: str = Field(description='is the id of the extraction of the format `"ai-%d" %(position in list)`, it should start from 1 (e.g `"ai-1", "ai-2", ..., "ai-n"`)')
    value: str = Field(description='is the value extracted from the text')
    original_text: str = Field(description='is the original text the extraction was made from')
    # start_index: list[str|int] = Field(description='a list of the index positions of the first character for each matching extraction. Some documents might capture many extractions where `key` and `value` are the same for many entries. This property allows the user to identify how many extractions happened, and where they are in the document.')

class Relationship(BaseModel):
    source_ref: str = Field(description='is the id for the source extraction for the relationship (e.g. extraction_1).')
    target_ref: str = Field(description='is the index for the target extraction for the relationship (e.g. extraction_2).')
    relationship_type: str = Field(description='is a description of the relationship between target and source.')

class ExtractionList(BaseModel):
    extractions: list[Extraction] = Field(default_factory=list)
    success: bool

class RelationshipList(BaseModel):
    relationships: list[Relationship] = Field(default_factory=list)
    success: bool

class DescribesIncident(BaseModel):
    describes_incident: bool = Field(description="does the <document> include malware analysis, APT group reports, data breaches and vulnerabilities?")
    explanation: str = Field(description="Two or three sentence summary of the incidents it describes OR summary of what it describes instead of an incident")
    incident_classification : list[str] = Field(description="All the valid incident classifications that describe this document/report")

class AttackFlowItem(BaseModel):
    position : int = Field(description="order of object starting at 0")
    attack_tactic_id : str
    attack_technique_id : str
    name: str
    description: str

class AttackFlowList(BaseModel):
    matrix : str = Field(description="one of ics, mobile and enterprise")
    items : list[AttackFlowItem]
    success: bool = Field(description="determines if there's any valid flow in <extractions>")

class ParserWithLogging(PydanticOutputParser):
    def parse(self, text: str):
        f = io.StringIO()
        print("\n"*5 + "=================start=================", file=f)
        print(text, file=f)
        print("=================close=================" + "\n"*5, file=f)
        logging.debug(f.getvalue())
        return super().parse(text)

def get_extractors_str(extractors):
    extractor: Extractor = None
    buffer = io.StringIO()
    for extractor in extractors:
        print(f"<extractor name={repr(extractor.name)} extraction_key={repr(extractor.extraction_key)}>", file=buffer)
        print(f"- {extractor.prompt_base}", file=buffer)
        if extractor.prompt_helper:
            print(f"- {extractor.prompt_helper}", file=buffer)
        if extractor.prompt_conversion:
            print(f"- {extractor.prompt_conversion}", file=buffer)
        if extractor.prompt_positive_examples:
            print(f"- Here are some examples of what SHOULD be extracted for {extractor.name} extractions: {json.dumps(extractor.prompt_positive_examples)}", file=buffer)
        if extractor.prompt_negative_examples:
            print(f"- Here are some examples of what SHOULD NOT be extracted for {extractor.name} extractions: {json.dumps(extractor.prompt_negative_examples)}", file=buffer)
        print("</extractor>", file=buffer)
        print("\n"*2, file=buffer)

    logging.debug("======== extractors ======")
    logging.debug(buffer.getvalue())
    logging.debug("======== extractors end ======")
    return buffer.getvalue()



if __name__ == '__main__':
    a = ExtractionList(extractions=[Extraction(type="yes", id="1", value="2", original_text="3")], success=True)
    print(a.model_dump())
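A sketch of the prompt fragment `get_extractors_str` produces, using a stand-in object carrying only the attributes the function reads (a real `Extractor` comes from `extractions.parse_extraction_config`; all values below are made up):

from types import SimpleNamespace
from txt2stix.ai_extractor.utils import get_extractors_str

fake = SimpleNamespace(
    name="ipv4", extraction_key="pattern_ipv4_address_only",  # hypothetical values
    prompt_base="Extract IPv4 addresses.", prompt_helper=None,
    prompt_conversion=None,
    prompt_positive_examples=["1.1.1[.]1"], prompt_negative_examples=[],
)
print(get_extractors_str([fake]))
# <extractor name='ipv4' extraction_key='pattern_ipv4_address_only'>
# - Extract IPv4 addresses.
# - Here are some examples of what SHOULD be extracted for ipv4 extractions: ["1.1.1[.]1"]
# </extractor>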
txt2stix/attack_flow.py
ADDED
@@ -0,0 +1,101 @@
import logging
import uuid
from stix2 import Relationship

from txt2stix.common import UUID_NAMESPACE
from txt2stix.retriever import STIXObjectRetriever
from stix2extensions.attack_action import AttackAction, AttackFlow
from stix2extensions._extensions import attack_flow_ExtensionDefinitionSMO
from .utils import AttackFlowList


def parse_flow(report, flow: AttackFlowList):
    logging.info(f"flow.success = {flow.success}")
    if not flow.success:
        return []
    attack_objects = STIXObjectRetriever().get_attack_objects(
        flow.matrix,
        [item.attack_tactic_id for item in flow.items]
        + [item.attack_technique_id for item in flow.items],
    )
    attack_objects = {
        obj["external_references"][0]["external_id"]: obj for obj in attack_objects
    }
    flow_objects = [report, attack_flow_ExtensionDefinitionSMO]
    last_action = None
    for i, item in enumerate(flow.items):
        try:
            tactic_obj = attack_objects[item.attack_tactic_id]
            technique_obj = attack_objects[item.attack_technique_id]
            action_obj = AttackAction(
                **{
                    "id": flow_id(
                        report["id"], item.attack_technique_id, item.attack_tactic_id
                    ),
                    "effect_refs": [f"attack-action--{str(uuid.uuid4())}"],
                    "technique_id": item.attack_technique_id,
                    "technique_ref": technique_obj["id"],
                    "tactic_id": item.attack_tactic_id,
                    "tactic_ref": tactic_obj["id"],
                    "name": item.name,
                    "description": item.description,
                },
                allow_custom=True,
            )
            action_obj.effect_refs.clear()
            if i == 0:
                flow_obj = {
                    "type": "attack-flow",
                    "id": report.id.replace("report", "attack-flow"),
                    "spec_version": "2.1",
                    "created": report.created,
                    "modified": report.modified,
                    "created_by_ref": report.created_by_ref,
                    "start_refs": [action_obj["id"]],
                    "name": report.name,
                    "description": report.description,
                    "scope": "malware",
                    "external_references": report.external_references,
                    "object_marking_refs": report.object_marking_refs,
                }
                flow_objects.append(AttackFlow(**flow_obj))
                flow_objects.append(
                    Relationship(
                        type="relationship",
                        spec_version="2.1",
                        id="relationship--"
                        + str(uuid.uuid5(UUID_NAMESPACE, f"attack-flow+{report.id}")),
                        created_by_ref=report.created_by_ref,
                        created=report.created,
                        modified=report.modified,
                        relationship_type="attack-flow",
                        description=f"Attack Flow for {report.name}",
                        source_ref=report.id,
                        target_ref=flow_obj["id"],
                        external_references=report.external_references,
                        object_marking_refs=report.object_marking_refs,
                    )
                )
            else:
                last_action["effect_refs"].append(action_obj["id"])
            flow_objects.append(tactic_obj)
            flow_objects.append(technique_obj)
            flow_objects.append(action_obj)
            last_action = action_obj
        except Exception as e:
            if len(flow_objects) == 2:
                logging.exception("FATAL: create attack flow object failed")
                return []
            logging.debug("create attack-action failed", exc_info=True)
            raise

    return flow_objects


def flow_id(report_id, technique_id, tactic_id):
    return "attack-action--" + str(
        uuid.uuid5(
            uuid.UUID(report_id.split("--")[-1]),
            f"{report_id}+{technique_id}+{tactic_id}",
        )
    )