cat-llm 0.0.43__py3-none-any.whl → 0.0.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_llm-0.0.43.dist-info → cat_llm-0.0.51.dist-info}/METADATA +1 -1
- cat_llm-0.0.51.dist-info/RECORD +15 -0
- catllm/CERAD_functions.py +4 -1
- catllm/__about__.py +1 -1
- catllm/__init__.py +2 -1
- catllm/build_web_research.py +169 -0
- catllm/image_functions.py +12 -4
- catllm/text_functions.py +4 -10
- cat_llm-0.0.43.dist-info/RECORD +0 -14
- {cat_llm-0.0.43.dist-info → cat_llm-0.0.51.dist-info}/WHEEL +0 -0
- {cat_llm-0.0.43.dist-info → cat_llm-0.0.51.dist-info}/licenses/LICENSE +0 -0
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.4
         | 
| 2 2 | 
             
            Name: cat-llm
         | 
| 3 | 
            -
            Version: 0.0.43
         | 
| 3 | 
            +
            Version: 0.0.51
         | 
| 4 4 | 
             
            Summary: A tool for categorizing text data and images using LLMs and vision models
         | 
| 5 5 | 
             
            Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
         | 
| 6 6 | 
             
            Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues
         | 
| @@ -0,0 +1,15 @@ | |
| 1 | 
            +
            catllm/CERAD_functions.py,sha256=NNEu_Q10tClV7vRIVEgSQY8ujlXDbpWDzo1AbqlN7nQ,22462
         | 
| 2 | 
            +
            catllm/__about__.py,sha256=OJwRe3RgRegCUAurPg3XhGq2Bn2f28R8WdC5B0p4XPY,404
         | 
| 3 | 
            +
            catllm/__init__.py,sha256=sf02zp7N0NW0mAQi7eQ4gliWR1EwoqvXkHN2HwwjcTE,372
         | 
| 4 | 
            +
            catllm/build_web_research.py,sha256=gpYizrEe0ENUTZ8iyjzwvQj5kTXI15K_3rtt3yvwvUo,6927
         | 
| 5 | 
            +
            catllm/image_functions.py,sha256=Gz-djnXVaLT8GOR0sc8aPjjuC9L_gIT2AjUMjsjjmi0,35492
         | 
| 6 | 
            +
            catllm/text_functions.py,sha256=YK9BcpTbEo5FhkA5aiNfK8c72kyiW6AYzuILYNqGjqc,16603
         | 
| 7 | 
            +
            catllm/images/circle.png,sha256=JWujAWAh08-TajAoEr_TAeFNLlfbryOLw6cgIBREBuQ,86202
         | 
| 8 | 
            +
            catllm/images/cube.png,sha256=nFec3e5bmRe4zrBCJ8QK-HcJLrG7u7dYdKhmdMfacfE,77275
         | 
| 9 | 
            +
            catllm/images/diamond.png,sha256=rJDZKtsnBGRO8FPA0iHuA8FvHFGi9PkI_DWSFdw6iv0,99568
         | 
| 10 | 
            +
            catllm/images/overlapping_pentagons.png,sha256=VO5plI6eoVRnjfqinn1nNzsCP2WQhuQy71V0EASouW4,71208
         | 
| 11 | 
            +
            catllm/images/rectangles.png,sha256=2XM16HO9EYWj2yHgN4bPXaCwPfl7iYQy0tQUGaJX9xg,40692
         | 
| 12 | 
            +
            cat_llm-0.0.51.dist-info/METADATA,sha256=yT3cmlhUDNlHxl21i9jwHlad6lcy-FcVXZDbMnM6HYk,17514
         | 
| 13 | 
            +
            cat_llm-0.0.51.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
         | 
| 14 | 
            +
            cat_llm-0.0.51.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
         | 
| 15 | 
            +
            cat_llm-0.0.51.dist-info/RECORD,,
         | 
    
        catllm/CERAD_functions.py
    CHANGED
    
    | @@ -374,7 +374,10 @@ def cerad_drawn_score( | |
| 374 374 | 
             
                normalized_data = pd.concat(normalized_data_list, ignore_index=True)
         | 
| 375 375 |  | 
| 376 376 | 
             
                categorized_data = pd.DataFrame({
         | 
| 377 | 
            -
                    'image_input':  | 
| 377 | 
            +
                    'image_input': (
         | 
| 378 | 
            +
                        image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series)) 
         | 
| 379 | 
            +
                        else pd.Series(image_files)
         | 
| 380 | 
            +
                    ),
         | 
| 378 381 | 
             
                    'link1': pd.Series(link1).reset_index(drop=True),
         | 
| 379 382 | 
             
                    'json': pd.Series(extracted_jsons).reset_index(drop=True)
         | 
| 380 383 | 
             
                })
         | 
    
        catllm/__about__.py
    CHANGED
    
    
    
        catllm/__init__.py
    CHANGED
    
    
| @@ -0,0 +1,169 @@ | |
| 1 | 
            +
            #build dataset classification
         | 
| 2 | 
            +
            def build_web_research_dataset(
                search_question,
                search_input,
                api_key,
                answer_format = "concise",
                additional_instructions = "",
                categories = ('Answer', 'URL'),
                user_model="claude-3-7-sonnet-20250219",
                creativity=0,
                safety=False,
                filename="categorized_data.csv",
                save_directory=None,
                model_source="Anthropic",
                time_delay=5
            ):
                """Build a dataset by asking a web-search-enabled model one question per input item.

                For every item in ``search_input`` a prompt is sent to the Anthropic
                Messages API with the ``web_search`` tool enabled. The first JSON object
                found in the text reply is kept, and the per-item JSON strings are
                flattened into columns of the returned DataFrame.

                Args:
                    search_question: Text interpolated into the prompt as
                        "Find information about {item}'s {search_question}".
                    search_input: Iterable of items to research. Items for which
                        ``pd.isna`` is true are skipped without an API call.
                    api_key: Key passed to ``anthropic.Anthropic``.
                    answer_format: Text interpolated into the prompt rule
                        "Provide your answer as ...".
                    additional_instructions: Extra text appended to the prompt rules.
                    categories: Labels printed as a numbered list; their count also
                        sizes the placeholder JSON stored for skipped (NaN) items.
                    user_model: Model identifier passed to ``messages.create``.
                    creativity: Passed to ``messages.create`` as ``temperature``.
                    safety: When true, the progress so far is written to CSV after
                        every item.
                    filename: CSV file name used by the safety save.
                    save_directory: Directory for the safety save; the current working
                        directory is used when ``None``.
                    model_source: Must be ``"Anthropic"``.
                    time_delay: Seconds to sleep before the first item and after every
                        API attempt.

                Returns:
                    A ``pandas.DataFrame`` with the columns ``survey_response``,
                    ``link1`` (raw reply text or an error/skip message) and ``json``
                    (the JSON string kept for the item), followed by the columns
                    obtained by normalizing each JSON string.

                Raises:
                    ValueError: If ``model_source`` is not ``"Anthropic"``.
                """
                # Fail fast: previously this was only detected inside the loop, after
                # sleeping, and never when every input happened to be NaN.
                if model_source != "Anthropic":
                    raise ValueError("Unknown source! Currently this function only supports 'Anthropic' as model_source.")

                import os
                import json
                import pandas as pd
                import regex
                from tqdm import tqdm
                import time

                def _normalize_json_rows(json_strings):
                    """Flatten each JSON string into one row; unparseable rows become {"1": "e"}."""
                    frames = []
                    for json_str in json_strings:
                        try:
                            frames.append(pd.json_normalize(json.loads(json_str)))
                        except json.JSONDecodeError:
                            frames.append(pd.DataFrame({"1": ["e"]}))
                    if not frames:
                        # pd.concat raises on an empty list; empty input yields no extra columns.
                        return pd.DataFrame()
                    return pd.concat(frames, ignore_index=True)

                categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
                print(categories_str)
                # NOTE(review): skipped (NaN) rows get this numbered placeholder ("1", "2", ...)
                # rather than the answer/url schema, so they add their own columns to the
                # result. Kept as-is for backward compatibility.
                example_JSON = json.dumps({str(i + 1): "0" for i in range(len(categories))}, indent=4)
                error_json = json.dumps({"answer": "e", "url": "e"})
                # Recursive pattern matching a balanced {...} span; compiled once for all items.
                json_object_pattern = regex.compile(r'\{(?:[^{}]|(?R))*\}', regex.DOTALL)

                client = None  # created on first real request, so all-NaN input needs no SDK
                processed_inputs = []
                link1 = []
                extracted_jsons = []

                for idx, item in enumerate(tqdm(search_input, desc="Building dataset")):
                    if idx == 0:  # delay the first item just to be safe
                        time.sleep(time_delay)
                    # Track items as they are consumed so partial saves never have to slice
                    # search_input (which may carry a non-default index or be a one-shot iterable).
                    processed_inputs.append(item)
                    reply = None

                    if pd.isna(item):
                        link1.append("Skipped NaN input")
                        extracted_jsons.append(example_JSON)
                    else:
                        prompt = f"""<role>You are a research assistant specializing in finding current, factual information.</role>

                        <task>Find information about {item}'s {search_question}</task>

                        <rules>
                        - Search for the most current and authoritative information available
                        - Provide your answer as {answer_format}
                        - Prioritize official sources when possible
                        - If information is not found, state "Information not found"
                        - Include exactly one source URL where you found the information
                        - Do not include any explanatory text or commentary beyond the JSON
                            {additional_instructions}
                        </rules>

                        <format>
                        Return your response as valid JSON with this exact structure:
                        {{
                        "answer": "Your factual answer or 'Information not found'",
                        "url": "Source URL or 'No source available'"
                    }}
                    </format>"""
                        if client is None:
                            import anthropic
                            client = anthropic.Anthropic(api_key=api_key)

                        # Keep the try body to the request itself so that exactly one link1
                        # entry is appended per item, whichever branch runs.
                        try:
                            message = client.messages.create(
                                model=user_model,
                                max_tokens=1024,
                                temperature=creativity,
                                messages=[{"role": "user", "content": prompt}],
                                tools=[{
                                    "type": "web_search_20250305",
                                    "name": "web_search"
                                }]
                            )
                            reply = " ".join(
                                block.text
                                for block in message.content
                                if getattr(block, "type", "") == "text"
                            ).strip()
                        except Exception as e:
                            # Best effort: record the failure for this row and carry on.
                            print(f"An error occurred: {e}")
                            link1.append(f"Error processing input: {e}")
                        else:
                            link1.append(reply)
                            print(reply)
                        time.sleep(time_delay)

                        if reply is None:
                            # The request failed; use the consistent error schema.
                            extracted_jsons.append(error_json)
                        else:
                            match = json_object_pattern.search(reply)
                            if match is None:
                                # No JSON object in the reply at all.
                                extracted_jsons.append(error_json)
                                print(error_json)
                            else:
                                raw_json = match.group(0).strip()
                                try:
                                    # Parse to validate, then re-serialize for consistent formatting.
                                    extracted_jsons.append(json.dumps(json.loads(raw_json)))
                                except json.JSONDecodeError as e:
                                    print(f"JSON parsing error: {e}")
                                    # Fall back to the raw text; normalization marks it as "e".
                                    extracted_jsons.append(raw_json)

                    # --- Safety Save ---
                    if safety:
                        temp_df = pd.DataFrame({
                            'survey_response': processed_inputs,
                            'link1': link1,
                            'json': extracted_jsons
                        })
                        temp_df = pd.concat([temp_df, _normalize_json_rows(extracted_jsons)], axis=1)
                        if save_directory is None:
                            save_directory = os.getcwd()
                        temp_df.to_csv(os.path.join(save_directory, filename), index=False)

                # --- Final DataFrame ---
                if isinstance(search_input, (pd.DataFrame, pd.Series)):
                    survey_response = search_input.reset_index(drop=True)
                else:
                    # Use the items actually iterated, so one-shot iterables still line up.
                    survey_response = pd.Series(processed_inputs)

                categorized_data = pd.DataFrame({
                    'survey_response': survey_response,
                    'link1': pd.Series(link1).reset_index(drop=True),
                    'json': pd.Series(extracted_jsons).reset_index(drop=True)
                })
                categorized_data = pd.concat(
                    [categorized_data, _normalize_json_rows(extracted_jsons)], axis=1
                )

                return categorized_data
         | 
    
        catllm/image_functions.py
    CHANGED
    
    | @@ -252,9 +252,11 @@ def image_multi_class( | |
| 252 252 | 
             
                    except json.JSONDecodeError:
         | 
| 253 253 | 
             
                        normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
         | 
| 254 254 | 
             
                normalized_data = pd.concat(normalized_data_list, ignore_index=True)
         | 
| 255 | 
            -
             | 
| 256 255 | 
             
                categorized_data = pd.DataFrame({
         | 
| 257 | 
            -
                    'image_input':  | 
| 256 | 
            +
                    'image_input': (
         | 
| 257 | 
            +
                        image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series)) 
         | 
| 258 | 
            +
                        else pd.Series(image_files)
         | 
| 259 | 
            +
                    ),
         | 
| 258 260 | 
             
                    'link1': pd.Series(link1).reset_index(drop=True),
         | 
| 259 261 | 
             
                    'json': pd.Series(extracted_jsons).reset_index(drop=True)
         | 
| 260 262 | 
             
                })
         | 
| @@ -549,7 +551,10 @@ def image_score_drawing( | |
| 549 551 | 
             
                normalized_data = pd.concat(normalized_data_list, ignore_index=True)
         | 
| 550 552 |  | 
| 551 553 | 
             
                categorized_data = pd.DataFrame({
         | 
| 552 | 
            -
                    'image_input':  | 
| 554 | 
            +
                    'image_input': (
         | 
| 555 | 
            +
                        image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series)) 
         | 
| 556 | 
            +
                        else pd.Series(image_files)
         | 
| 557 | 
            +
                    ),
         | 
| 553 558 | 
             
                    'link1': pd.Series(link1).reset_index(drop=True),
         | 
| 554 559 | 
             
                    'json': pd.Series(extracted_jsons).reset_index(drop=True)
         | 
| 555 560 | 
             
                })
         | 
| @@ -835,7 +840,10 @@ def image_features( | |
| 835 840 | 
             
                normalized_data = pd.concat(normalized_data_list, ignore_index=True)
         | 
| 836 841 |  | 
| 837 842 | 
             
                categorized_data = pd.DataFrame({
         | 
| 838 | 
            -
                    'image_input':  | 
| 843 | 
            +
                    'image_input': (
         | 
| 844 | 
            +
                        image_files.reset_index(drop=True) if isinstance(image_files, (pd.DataFrame, pd.Series)) 
         | 
| 845 | 
            +
                        else pd.Series(image_files)
         | 
| 846 | 
            +
                    ),
         | 
| 839 847 | 
             
                    'link1': pd.Series(link1).reset_index(drop=True),
         | 
| 840 848 | 
             
                    'json': pd.Series(extracted_jsons).reset_index(drop=True)
         | 
| 841 849 | 
             
                })
         | 
    
        catllm/text_functions.py
    CHANGED
    
    | @@ -373,20 +373,14 @@ Provide your work in JSON format where the number belonging to each category is | |
| 373 373 | 
             
                    except json.JSONDecodeError:
         | 
| 374 374 | 
             
                        normalized_data_list.append(pd.DataFrame({"1": ["e"]}))
         | 
| 375 375 | 
             
                normalized_data = pd.concat(normalized_data_list, ignore_index=True)
         | 
| 376 | 
            -
             | 
| 377 376 | 
             
                categorized_data = pd.DataFrame({
         | 
| 378 | 
            -
                    ' | 
| 377 | 
            +
                    'image_input': (
         | 
| 378 | 
            +
                        survey_input.reset_index(drop=True) if isinstance(survey_input, (pd.DataFrame, pd.Series)) 
         | 
| 379 | 
            +
                        else pd.Series(survey_input)
         | 
| 380 | 
            +
                    ),
         | 
| 379 381 | 
             
                    'link1': pd.Series(link1).reset_index(drop=True),
         | 
| 380 382 | 
             
                    'json': pd.Series(extracted_jsons).reset_index(drop=True)
         | 
| 381 383 | 
             
                })
         | 
| 382 384 | 
             
                categorized_data = pd.concat([categorized_data, normalized_data], axis=1)
         | 
| 383 385 |  | 
| 384 | 
            -
                if columns != "numbered": #if user wants text columns
         | 
| 385 | 
            -
                    categorized_data.columns = list(categorized_data.columns[:3]) + categories[:len(categorized_data.columns) - 3]
         | 
| 386 | 
            -
             | 
| 387 | 
            -
                if to_csv:
         | 
| 388 | 
            -
                    if save_directory is None:
         | 
| 389 | 
            -
                        save_directory = os.getcwd()
         | 
| 390 | 
            -
                    categorized_data.to_csv(os.path.join(save_directory, filename), index=False)
         | 
| 391 | 
            -
                
         | 
| 392 386 | 
             
                return categorized_data
         | 
    
        cat_llm-0.0.43.dist-info/RECORD
    DELETED
    
    | @@ -1,14 +0,0 @@ | |
| 1 | 
            -
            catllm/CERAD_functions.py,sha256=Di8AGcBtc5OiVPoGUnpHhtwZBBPlOWTKmlSYOYKsck0,22320
         | 
| 2 | 
            -
            catllm/__about__.py,sha256=_1VK5w3uEjL5HtCln4mwW5YvqNAsSt5RgHTzFfk0fUE,404
         | 
| 3 | 
            -
            catllm/__init__.py,sha256=BpAG8nPhM3ZQRd0WqkubI_36-VCOs4eCYtGVgzz48Bs,337
         | 
| 4 | 
            -
            catllm/image_functions.py,sha256=w1Q1qoJqVp570AxnaBNpKpQ0bitl50aprCzgdVqhtGY,35067
         | 
| 5 | 
            -
            catllm/text_functions.py,sha256=K6oetWYk25PwsllWSZP4cFrz7kyxJg0plPRvpmQkCsU,16846
         | 
| 6 | 
            -
            catllm/images/circle.png,sha256=JWujAWAh08-TajAoEr_TAeFNLlfbryOLw6cgIBREBuQ,86202
         | 
| 7 | 
            -
            catllm/images/cube.png,sha256=nFec3e5bmRe4zrBCJ8QK-HcJLrG7u7dYdKhmdMfacfE,77275
         | 
| 8 | 
            -
            catllm/images/diamond.png,sha256=rJDZKtsnBGRO8FPA0iHuA8FvHFGi9PkI_DWSFdw6iv0,99568
         | 
| 9 | 
            -
            catllm/images/overlapping_pentagons.png,sha256=VO5plI6eoVRnjfqinn1nNzsCP2WQhuQy71V0EASouW4,71208
         | 
| 10 | 
            -
            catllm/images/rectangles.png,sha256=2XM16HO9EYWj2yHgN4bPXaCwPfl7iYQy0tQUGaJX9xg,40692
         | 
| 11 | 
            -
            cat_llm-0.0.43.dist-info/METADATA,sha256=JWOKRT2ERmN_wxNgbFLtW_xRyCwA79aodwiinqNCOC4,17514
         | 
| 12 | 
            -
            cat_llm-0.0.43.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
         | 
| 13 | 
            -
            cat_llm-0.0.43.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
         | 
| 14 | 
            -
            cat_llm-0.0.43.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         |