lexoid 0.1.8.post1__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexoid/api.py CHANGED
@@ -19,6 +19,7 @@ from lexoid.core.utils import (
     recursive_read_html,
     router,
     split_pdf,
+    create_sub_pdf,
 )


@@ -28,20 +29,24 @@ class ParserType(Enum):
     AUTO = "AUTO"


-def parse_chunk(
-    path: str, parser_type: ParserType, raw: bool, **kwargs
-) -> List[Dict] | str:
+def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
     """
     Parses a file using the specified parser type.

     Args:
         path (str): The file path or URL.
         parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO).
-        raw (bool): Whether to return raw text or structured data.
         **kwargs: Additional arguments for the parser.

     Returns:
-        List[Dict] | str: Parsed document data as a list of dictionaries or raw text.
+        Dict: Dictionary containing:
+            - raw: Full markdown content as string
+            - segments: List of dictionaries with metadata and content
+            - title: Title of the document
+            - url: URL if applicable
+            - parent_title: Title of parent doc if recursively parsed
+            - recursive_docs: List of dictionaries for recursively parsed documents
+            - token_usage: Dictionary containing token usage statistics
     """
     if parser_type == ParserType.AUTO:
         parser_type = ParserType[router(path)]
@@ -52,63 +57,81 @@ def parse_chunk(
     )
     if parser_type == ParserType.STATIC_PARSE:
         logger.debug("Using static parser")
-        return parse_static_doc(path, raw, **kwargs)
+        return parse_static_doc(path, **kwargs)
     else:
         logger.debug("Using LLM parser")
-        return parse_llm_doc(path, raw, **kwargs)
+        return parse_llm_doc(path, **kwargs)


 def parse_chunk_list(
-    file_paths: List[str], parser_type: ParserType, raw: bool, kwargs: Dict
-) -> List[Dict | str]:
+    file_paths: List[str], parser_type: ParserType, kwargs: Dict
+) -> Dict:
     """
     Parses a list of files using the specified parser type.

     Args:
         file_paths (list): List of file paths.
         parser_type (ParserType): The type of parser to use.
-        raw (bool): Whether to return raw text or structured data.
         kwargs (dict): Additional arguments for the parser.

     Returns:
-        List[Dict | str]: List of parsed documents with raw text and/or metadata.
+        Dict: Dictionary containing parsed document data
     """
-    local_docs = []
+    combined_segments = []
+    raw_texts = []
+    token_usage = {"input": 0, "output": 0}
     for file_path in file_paths:
-        result = parse_chunk(file_path, parser_type, raw, **kwargs)
-        if isinstance(result, list):
-            local_docs.extend(result)
-        else:
-            local_docs.append(result.replace("<page break>", "\n\n"))
-    return local_docs
+        result = parse_chunk(file_path, parser_type, **kwargs)
+        combined_segments.extend(result["segments"])
+        raw_texts.append(result["raw"])
+        if "token_usage" in result:
+            token_usage["input"] += result["token_usage"]["input"]
+            token_usage["output"] += result["token_usage"]["output"]
+    token_usage["total"] = token_usage["input"] + token_usage["output"]
+
+    return {
+        "raw": "\n\n".join(raw_texts),
+        "segments": combined_segments,
+        "title": kwargs.get("title", ""),
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": token_usage,
+    }


 def parse(
     path: str,
     parser_type: Union[str, ParserType] = "LLM_PARSE",
-    raw: bool = False,
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
-) -> Union[List[Dict], str]:
+) -> Dict:
     """
     Parses a document or URL, optionally splitting it into chunks and using multiprocessing.

     Args:
         path (str): The file path or URL.
-        parser_type (Union[str, ParserType], optional): The type of parser to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). Defaults to "LLM_PARSE".
-        raw (bool, optional): Whether to return raw text or structured data. Defaults to False.
-        pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
-        max_processes (int, optional): Maximum number of processes for parallel processing. Defaults to 4.
+        parser_type (Union[str, ParserType], optional): Parser type ("LLM_PARSE", "STATIC_PARSE", or "AUTO").
+        pages_per_split (int, optional): Number of pages per split for chunking.
+        max_processes (int, optional): Maximum number of processes for parallel processing.
         **kwargs: Additional arguments for the parser.

     Returns:
-        Union[List[Dict], str]: Parsed document data as a list of dictionaries or raw text.
+        Dict: Dictionary containing:
+            - raw: Full markdown content as string
+            - segments: List of dictionaries with metadata and content
+            - title: Title of the document
+            - url: URL if applicable
+            - parent_title: Title of parent doc if recursively parsed
+            - recursive_docs: List of dictionaries for recursively parsed documents
+            - token_usage: Dictionary containing token usage statistics
     """
     kwargs["title"] = os.path.basename(path)
     kwargs["pages_per_split_"] = pages_per_split
     as_pdf = kwargs.get("as_pdf", False)
     depth = kwargs.get("depth", 1)
+
     if type(parser_type) == str:
         parser_type = ParserType[parser_type]

@@ -120,15 +143,19 @@ def parse(
             as_pdf = True

         if path.startswith(("http://", "https://")):
-            download_dir = os.path.join(temp_dir, "downloads/")
+            kwargs["url"] = path
+            download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
             os.makedirs(download_dir, exist_ok=True)
             if is_supported_url_file_type(path):
                 path = download_file(path, download_dir)
             elif as_pdf:
-                pdf_path = os.path.join(download_dir, f"webpage_{int(time())}.pdf")
+                pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
+                if not pdf_filename.endswith(".pdf"):
+                    pdf_filename += ".pdf"
+                pdf_path = os.path.join(download_dir, pdf_filename)
                 path = convert_to_pdf(path, pdf_path)
             else:
-                return recursive_read_html(path, depth, raw)
+                return recursive_read_html(path, depth)

         assert is_supported_file_type(
             path
@@ -138,11 +165,15 @@ def parse(
             pdf_path = os.path.join(temp_dir, "converted.pdf")
             path = convert_to_pdf(path, pdf_path)

+        if "page_nums" in kwargs and path.lower().endswith(".pdf"):
+            sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
+            os.makedirs(sub_pdf_dir, exist_ok=True)
+            sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
+            path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
+
         if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
             kwargs["split"] = False
-            all_docs = parse_chunk(path, parser_type, raw, **kwargs)
-            if raw:
-                all_docs = [all_docs]
+            result = parse_chunk(path, parser_type, **kwargs)
         else:
             kwargs["split"] = True
             split_dir = os.path.join(temp_dir, "splits/")
@@ -156,22 +187,39 @@ def parse(
                 for i in range(0, len(split_files), chunk_size)
             ]

-            process_args = [(chunk, parser_type, raw, kwargs) for chunk in file_chunks]
+            process_args = [(chunk, parser_type, kwargs) for chunk in file_chunks]

             if max_processes == 1 or len(file_chunks) == 1:
-                all_docs = [parse_chunk_list(*args) for args in process_args]
+                chunk_results = [parse_chunk_list(*args) for args in process_args]
             else:
                 with ProcessPoolExecutor(max_workers=max_processes) as executor:
-                    all_docs = list(executor.map(parse_chunk_list, *zip(*process_args)))
-
-            all_docs = [item for sublist in all_docs for item in sublist]
+                    chunk_results = list(
+                        executor.map(parse_chunk_list, *zip(*process_args))
+                    )
+
+            # Combine results from all chunks
+            result = {
+                "raw": "\n\n".join(r["raw"] for r in chunk_results),
+                "segments": [seg for r in chunk_results for seg in r["segments"]],
+                "title": kwargs["title"],
+                "url": kwargs.get("url", ""),
+                "parent_title": kwargs.get("parent_title", ""),
+                "recursive_docs": [],
+                "token_usage": {
+                    "input": sum(r["token_usage"]["input"] for r in chunk_results),
+                    "output": sum(r["token_usage"]["output"] for r in chunk_results),
+                    "total": sum(r["token_usage"]["total"] for r in chunk_results),
+                },
+            }
+        if as_pdf:
+            result["pdf_path"] = path

         if depth > 1:
-            new_docs = all_docs.copy()
-            for doc in all_docs:
+            recursive_docs = []
+            for segment in result["segments"]:
                 urls = re.findall(
                     r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*',
-                    doc if raw else doc["content"],
+                    segment["content"],
                 )
                 for url in urls:
                     if "](" in url:
@@ -182,19 +230,16 @@

                     kwargs_cp = kwargs.copy()
                     kwargs_cp["depth"] = depth - 1
-                    res = parse(
+                    kwargs_cp["parent_title"] = result["title"]
+                    sub_doc = parse(
                         url,
                         parser_type=parser_type,
-                        raw=raw,
                         pages_per_split=pages_per_split,
                         max_processes=max_processes,
                         **kwargs_cp,
                     )
+                    recursive_docs.append(sub_doc)

-                    if raw:
-                        new_docs.append(res)
-                    else:
-                        new_docs.extend(res)
-            all_docs = new_docs
+        result["recursive_docs"] = recursive_docs

-    return "\n".join(all_docs) if raw else all_docs
+    return result
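The net effect of the `api.py` changes above is that `parse` now always returns a single dictionary rather than a list of dicts or a raw string. A minimal usage sketch of the new shape (the file name is hypothetical, and a configured API key for the default Gemini model is assumed):

```python
from lexoid.api import parse

# Hypothetical input; any supported file path or URL works the same way.
result = parse("sample.pdf", parser_type="LLM_PARSE", pages_per_split=4, max_processes=4)

print(result["title"])              # basename of the input path
print(result["raw"][:200])          # full markdown output
for segment in result["segments"]:  # per-page chunks with metadata
    print(segment["metadata"]["page"], len(segment["content"]))
print(result["token_usage"])        # {"input": ..., "output": ..., "total": ...}
print(result["recursive_docs"])     # populated only when depth > 1
```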
lexoid/core/parse_type/llm_parser.py CHANGED
@@ -18,6 +18,7 @@ from lexoid.core.prompt_templates import (
 from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
+from together import Together
 from huggingface_hub import InferenceClient


@@ -33,38 +34,36 @@ def retry_on_http_error(func):
             return func(*args, **kwargs)
         except HTTPError as e:
             logger.error(f"Retry failed: {e}")
-            if kwargs.get("raw", False):
-                return ""
-            return [
-                {
-                    "metadata": {
-                        "title": kwargs["title"],
-                        "page": kwargs.get("start", 0),
-                    },
-                    "content": "",
-                }
-            ]
+            return {
+                "raw": "",
+                "segments": [],
+                "title": kwargs["title"],
+                "url": kwargs.get("url", ""),
+                "parent_title": kwargs.get("parent_title", ""),
+                "recursive_docs": [],
+                "error": f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}",
+            }

     return wrapper


 @retry_on_http_error
-def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
         kwargs["model"] = "gemini-1.5-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
-        return parse_with_gemini(path, raw, **kwargs)
+        return parse_with_gemini(path, **kwargs)
     if model.startswith("gpt"):
-        return parse_with_api(path, raw, api="openai", **kwargs)
+        return parse_with_api(path, api="openai", **kwargs)
     if model.startswith("meta-llama"):
         if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
-            return parse_with_together(path, raw, **kwargs)
-        return parse_with_api(path, raw, api="huggingface", **kwargs)
+            return parse_with_api(path, api="together", **kwargs)
+        return parse_with_api(path, api="huggingface", **kwargs)
     raise ValueError(f"Unsupported model: {model}")


-def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     api_key = os.environ.get("GOOGLE_API_KEY")
     if not api_key:
         raise ValueError("GOOGLE_API_KEY environment variable is not set")
@@ -119,25 +118,33 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
         if "text" in part
     )

-    result = ""
+    combined_text = ""
     if "<output>" in raw_text:
-        result = raw_text.split("<output>")[1].strip()
+        combined_text = raw_text.split("<output>")[1].strip()
         if "</output>" in result:
-            result = result.split("</output>")[0].strip()
-
-    if raw:
-        return result
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs.get("start", 0) + page_no,
-            },
-            "content": page,
-        }
-        for page_no, page in enumerate(result.split("<page-break>"), start=1)
-    ]
+            combined_text = result.split("</output>")[0].strip()
+
+    token_usage = result["usageMetadata"]
+    input_tokens = token_usage.get("promptTokenCount", 0)
+    output_tokens = token_usage.get("candidatesTokenCount", 0)
+    total_tokens = input_tokens + output_tokens
+
+    return {
+        "raw": combined_text,
+        "segments": [
+            {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
+            for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
+        ],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": input_tokens,
+            "output": output_tokens,
+            "total": total_tokens,
+        },
+    }


 def convert_pdf_page_to_base64(
@@ -155,97 +162,17 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


-def parse_with_together(path: str, raw: bool, **kwargs) -> List[Dict] | str:
-    api_key = os.environ.get("TOGETHER_API_KEY")
-    if not api_key:
-        raise ValueError("TOGETHER_API_KEY environment variable is not set")
-
-    url = "https://api.together.xyz/v1/chat/completions"
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
-
-    mime_type, _ = mimetypes.guess_type(path)
-    if mime_type and mime_type.startswith("image"):
-        with open(path, "rb") as img_file:
-            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-            images = [(0, f"data:{mime_type};base64,{image_base64}")]
-    else:
-        pdf_document = pdfium.PdfDocument(path)
-        images = [
-            (
-                page_num,
-                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
-            )
-            for page_num in range(len(pdf_document))
-        ]
-
-    all_results = []
-    for page_num, image_url in images:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": LLAMA_PARSER_PROMPT},
-                    {"type": "image_url", "image_url": {"url": image_url}},
-                ],
-            }
-        ]
-
-        payload = {
-            "model": kwargs["model"],
-            "messages": messages,
-            "max_tokens": kwargs.get("max_tokens", 1024),
-            "temperature": kwargs.get("temperature", 0.7),
-        }
-
-        response = requests.post(url, json=payload, headers=headers)
-        response.raise_for_status()
-        response_data = response.json()
-
-        page_text = response_data["choices"][0]["message"]["content"]
-        if kwargs.get("verbose", None):
-            logger.debug(f"Page {page_num + 1} response: {page_text}")
-
-        result = page_text
-        if "<output>" in page_text:
-            result = page_text.split("<output>")[1].strip()
-            if "</output>" in result:
-                result = result.split("</output>")[0].strip()
-        all_results.append((page_num, result))
-
-    all_results.sort(key=lambda x: x[0])
-    all_texts = [text for _, text in all_results]
-    combined_text = "<page-break>".join(all_texts)
-
-    if raw:
-        return combined_text
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs.get("start", 0) + page_no,
-            },
-            "content": page,
-        }
-        for page_no, page in enumerate(all_texts, start=1)
-    ]
-
-
-def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
     """
     Parse documents (PDFs or images) using various vision model APIs.

     Args:
         path (str): Path to the document to parse
-        raw (bool): If True, return raw text; if False, return structured data
-        api (str): Which API to use ("openai" or "huggingface")
+        api (str): Which API to use ("openai", "huggingface", or "together")
         **kwargs: Additional arguments including model, temperature, title, etc.

     Returns:
-        List[Dict] | str: Parsed content either as raw text or structured data
+        Dict: Dictionary containing parsed document data
     """
     # Initialize appropriate client
     clients = {
@@ -253,6 +180,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
         "huggingface": lambda: InferenceClient(
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
+        "together": lambda: Together(),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -329,6 +257,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str

         # Get completion from selected API
         response = client.chat.completions.create(**completion_params)
+        token_usage = response.usage

         # Extract the response text
         page_text = response.choices[0].message.content
@@ -341,23 +270,44 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
             result = page_text.split("<output>")[1].strip()
             if "</output>" in result:
                 result = result.split("</output>")[0].strip()
-        all_results.append((page_num, result))
+        all_results.append(
+            (
+                page_num,
+                result,
+                token_usage.prompt_tokens,
+                token_usage.completion_tokens,
+                token_usage.total_tokens,
+            )
+        )

     # Sort results by page number and combine
     all_results.sort(key=lambda x: x[0])
-    all_texts = [text for _, text in all_results]
+    all_texts = [text for _, text, _, _, _ in all_results]
     combined_text = "<page-break>".join(all_texts)

-    if raw:
-        return combined_text
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs.get("start", 0) + page_no,
-            },
-            "content": page,
-        }
-        for page_no, page in enumerate(all_texts, start=1)
-    ]
+    return {
+        "raw": combined_text,
+        "segments": [
+            {
+                "metadata": {
+                    "page": kwargs.get("start", 0) + page_no + 1,
+                    "token_usage": {
+                        "input": input_tokens,
+                        "output": output_tokens,
+                        "total": total_tokens,
+                    },
+                },
+                "content": page,
+            }
+            for page_no, page, input_tokens, output_tokens, total_tokens in all_results
+        ],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": sum(input_tokens for _, _, input_tokens, _, _ in all_results),
+            "output": sum(output_tokens for _, _, _, output_tokens, _ in all_results),
+            "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
+        },
+    }
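As the hunks above show, `parse_with_api` now threads `response.usage` through each page, so token counts surface both in every segment's metadata and as a document-level sum. A small sketch of reading them, assuming a hypothetical `scan.png`, the Together-served Llama model named in the diff, and the matching API key in the environment:

```python
from lexoid.api import parse

# Image input routes through parse_with_api(api="together") for this model.
result = parse("scan.png", parser_type="LLM_PARSE", model="meta-llama/Llama-Vision-Free")

for segment in result["segments"]:
    usage = segment["metadata"]["token_usage"]
    print(segment["metadata"]["page"], usage["input"], usage["output"], usage["total"])

# The top-level counts are the sums over all pages.
print(result["token_usage"])
```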
lexoid/core/parse_type/static_parser.py CHANGED
@@ -9,73 +9,89 @@ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
 from docx import Document


-def parse_static_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_static_doc(path: str, **kwargs) -> Dict:
+    """
+    Parses a document using static parsing methods.
+
+    Args:
+        path (str): The file path.
+        **kwargs: Additional arguments for parsing.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     framework = kwargs.get("framework", "pdfplumber")

     file_type = get_file_type(path)
     if file_type == "application/pdf":
         if framework == "pdfplumber":
-            return parse_with_pdfplumber(path, raw, **kwargs)
+            return parse_with_pdfplumber(path, **kwargs)
         elif framework == "pdfminer":
-            return parse_with_pdfminer(path, raw, **kwargs)
+            return parse_with_pdfminer(path, **kwargs)
         else:
             raise ValueError(f"Unsupported framework: {framework}")
     elif "wordprocessing" in file_type:
-        return parse_with_docx(path, raw, **kwargs)
+        return parse_with_docx(path, **kwargs)
     elif file_type == "text/html":
         with open(path, "r") as f:
             html_content = f.read()
-        return html_to_markdown(html_content, raw, kwargs["title"])
+        return html_to_markdown(html_content, kwargs["title"])
     elif file_type == "text/plain":
         with open(path, "r") as f:
             content = f.read()
-        if raw:
-            return content
-        else:
-            return [
-                {
-                    "metadata": {"title": kwargs["title"], "page": 1},
-                    "content": content,
-                }
-            ]
+        return {
+            "raw": content,
+            "segments": [{"metadata": {"page": 1}, "content": content}],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     elif file_type == "text/csv":
         df = pd.read_csv(path)
         content = df.to_markdown(index=False)
-        if raw:
-            return content
-        else:
-            return [
-                {
-                    "metadata": {"title": kwargs["title"], "page": 1},
-                    "content": content,
-                }
-            ]
+        return {
+            "raw": content,
+            "segments": [{"metadata": {"page": 1}, "content": content}],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     else:
         raise ValueError(f"Unsupported file type: {file_type}")


-def parse_with_pdfminer(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_with_pdfminer(path: str, **kwargs) -> Dict:
+    """
+    Parse PDF using pdfminer.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     pages = list(extract_pages(path))
-    docs = []
+    segments = []
+    raw_texts = []
+
     for page_num, page_layout in enumerate(pages, start=1):
         page_text = "".join(
             element.get_text()
             for element in page_layout
             if isinstance(element, LTTextContainer)
         )
-        if raw:
-            docs.append(page_text)
-        else:
-            docs.append(
-                {
-                    "metadata": {
-                        "title": kwargs["title"],
-                        "page": kwargs["start"] + page_num,
-                    },
-                    "content": page_text,
-                }
-            )
-    return "\n".join(docs) if raw else docs
+        raw_texts.append(page_text)
+        segments.append(
+            {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
+        )
+
+    return {
+        "raw": "\n".join(raw_texts),
+        "segments": segments,
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }


 def process_table(table) -> str:
@@ -359,44 +375,44 @@ def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
     return page_texts


-def parse_with_pdfplumber(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
     """
-    Parse PDF and return either raw text or structured data.
-
-    Args:
-        path (str): Path to the PDF file
-        raw (bool): If True, return raw text with page breaks; if False, return structured data
-        **kwargs: Additional arguments including 'title' and 'start' page number
+    Parse PDF using pdfplumber.

     Returns:
-        Union[List[Dict], str]: Either a list of dictionaries containing page metadata and content,
-            or a string of raw text with page breaks
+        Dict: Dictionary containing parsed document data
     """
     page_texts = process_pdf_with_pdfplumber(path)
-    if raw:
-        return "<page-break>".join(page_texts)
-    return [
-        {
-            "metadata": {"title": kwargs["title"], "page": kwargs["start"] + page_num},
-            "content": page_text,
-        }
+    segments = [
+        {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
         for page_num, page_text in enumerate(page_texts, start=1)
     ]

+    return {
+        "raw": "<page-break>".join(page_texts),
+        "segments": segments,
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }
+

-def parse_with_docx(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_with_docx(path: str, **kwargs) -> Dict:
+    """
+    Parse DOCX document.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     doc = Document(path)
     full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])

-    if raw:
-        return full_text
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs["start"] + 1,
-            },
-            "content": full_text,
-        }
-    ]
+    return {
+        "raw": full_text,
+        "segments": [{"metadata": {"page": kwargs["start"] + 1}, "content": full_text}],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }
lexoid/core/utils.py CHANGED
@@ -5,7 +5,8 @@ import os
 import re
 import sys
 from difflib import SequenceMatcher
-from typing import Dict, List, Union
+from hashlib import md5
+from typing import Dict, List, Optional
 from urllib.parse import urlparse

 import nest_asyncio
@@ -44,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
     return paths


+def create_sub_pdf(
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+) -> str:
+    if isinstance(page_nums, int):
+        page_nums = (page_nums,)
+    page_nums = tuple(sorted(set(page_nums)))
+    with pikepdf.open(input_path) as pdf:
+        indices = page_nums if page_nums else range(len(pdf.pages))
+        with pikepdf.new() as new_pdf:
+            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
+            new_pdf.save(output_path)
+    return output_path
+
+
 def convert_image_to_pdf(image_path: str) -> bytes:
     with Image.open(image_path) as img:
         img_rgb = img.convert("RGB")
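The new `create_sub_pdf` helper backs the `page_nums` argument that `parse` now accepts; page numbers are 1-based, as the `pdf.pages[i - 1]` indexing shows. A short sketch, with `report.pdf` as a hypothetical input:

```python
from lexoid.api import parse
from lexoid.core.utils import create_sub_pdf

# Extract pages 1 and 3 into a new PDF (1-based page numbers).
create_sub_pdf("report.pdf", "report_subset.pdf", page_nums=(1, 3))

# Or let parse() do it internally: the page_nums kwarg routes through
# create_sub_pdf before the document is split and parsed.
result = parse("report.pdf", page_nums=(1, 3))
print(len(result["segments"]))
```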
@@ -184,14 +199,11 @@ def find_dominant_heading_level(markdown_content: str) -> str:
     return min(heading_counts.keys(), key=len)


-def split_md_by_headings(
-    markdown_content: str, heading_pattern: str, title: str
-) -> List[Dict]:
+def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Dict]:
     """
     Splits markdown content by the specified heading pattern and structures it.

     Args:
-        url (str): The URL of the HTML page
         markdown_content (str): The markdown content to split
         heading_pattern (str): The heading pattern to split on (e.g., '##' or 'underline')

@@ -211,7 +223,7 @@ def split_md_by_headings(
     if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
         structured_content.append(
             {
-                "metadata": {"title": title, "page": "Introduction"},
+                "metadata": {"page": "Introduction"},
                 "content": sections.pop(0),
             }
         )
@@ -221,7 +233,7 @@
         if i + 1 < len(sections):
             structured_content.append(
                 {
-                    "metadata": {"title": title, "page": sections[i]},
+                    "metadata": {"page": sections[i]},
                     "content": sections[i + 1],
                 }
             )
@@ -238,7 +250,7 @@
     if len(sections) > len(headings):
         structured_content.append(
             {
-                "metadata": {"title": title, "page": "Introduction"},
+                "metadata": {"page": "Introduction"},
                 "content": sections.pop(0),
             }
         )
@@ -248,7 +260,7 @@
         clean_heading = heading.replace(heading_pattern, "").strip()
         structured_content.append(
             {
-                "metadata": {"title": title, "page": clean_heading},
+                "metadata": {"page": clean_heading},
                 "content": content,
             }
         )
@@ -256,39 +268,47 @@
     return structured_content


-def html_to_markdown(html: str, raw: bool, title: str) -> str:
+def html_to_markdown(html: str, title: str, url: str) -> str:
     """
     Converts HTML content to markdown.

     Args:
         html (str): The HTML content to convert.
-        raw (bool): Whether to return raw markdown text or structured data.
+        title (str): The title of the HTML page
+        url (str): The URL of the HTML page

     Returns:
-        Union[str, List[Dict]]: Either raw markdown content or structured data with metadata and content sections.
+        Dict: Dictionary containing parsed document data
     """
     markdown_content = md(html)

-    if raw:
-        return markdown_content
-
     # Find the dominant heading level
     heading_pattern = find_dominant_heading_level(markdown_content)

     # Split content by headings and structure it
-    return split_md_by_headings(markdown_content, heading_pattern, title)
+    split_md = split_md_by_headings(markdown_content, heading_pattern)
+
+    content = {
+        "raw": markdown_content,
+        "segments": split_md,
+        "title": title,
+        "url": url,
+        "parent_title": "",
+        "recursive_docs": [],
+    }
+
+    return content


-def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
+def read_html_content(url: str) -> Dict:
     """
     Reads the content of an HTML page from the given URL and converts it to markdown or structured content.

     Args:
         url (str): The URL of the HTML page.
-        raw (bool): Whether to return raw markdown text or structured data.

     Returns:
-        Union[str, List[Dict]]: Either raw markdown content or structured data with metadata and content sections.
+        Dict: Dictionary containing parsed document data
     """

     try:
@@ -351,7 +371,10 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
-    return html_to_markdown(str(soup), raw, title=url)
+    title = soup.title.string.strip() if soup.title else "No title"
+    url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
+    full_title = f"{title} - {url_hash}"
+    return html_to_markdown(str(soup), title=full_title, url=url)


 def extract_urls_from_markdown(content: str) -> List[str]:
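`read_html_content` now derives the title from the page's `<title>` plus the first eight hex characters of the URL's MD5, so identical titles fetched from different URLs stay distinguishable. For illustration (URL and page title are placeholders):

```python
from hashlib import md5

url = "https://example.com/docs"
url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
full_title = f"Example Docs - {url_hash}"  # "<page title> - <8-char hash>"
print(full_title)
```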
@@ -378,61 +401,60 @@ def extract_urls_from_markdown(content: str) -> List[str]:
     return list(set(urls))  # Remove duplicates


-def recursive_read_html(
-    url: str, depth: int, raw: bool, visited_urls: set = None
-) -> Union[str, List[Dict]]:
+def recursive_read_html(url: str, depth: int, visited_urls: set = None) -> Dict:
     """
     Recursively reads HTML content from URLs up to specified depth.

     Args:
         url (str): The URL to parse
         depth (int): How many levels deep to recursively parse
-        raw (bool): Whether to return raw text or structured data
         visited_urls (set): Set of already visited URLs to prevent cycles

     Returns:
-        Union[str, List[Dict]]: Combined content from all parsed URLs
+        Dict: Dictionary containing parsed document data
     """
     if visited_urls is None:
         visited_urls = set()

     if url in visited_urls:
-        return "" if raw else []
+        return {
+            "raw": "",
+            "segments": [],
+            "title": "",
+            "url": url,
+            "parent_title": "",
+            "recursive_docs": [],
+        }

     visited_urls.add(url)

     try:
-        content = read_html_content(url, raw)
+        content = read_html_content(url)
     except Exception as e:
         print(f"Error processing URL {url}: {str(e)}")
-        return "" if raw else []
+        return {
+            "raw": "",
+            "segments": [],
+            "title": "",
+            "url": url,
+            "parent_title": "",
+            "recursive_docs": [],
+        }

     if depth <= 1:
         return content

-    # Extract URLs from the content
-    if raw:
-        urls = extract_urls_from_markdown(content)
-    else:
-        # Extract URLs from all content sections
-        urls = []
-        for doc in content:
-            urls.extend(extract_urls_from_markdown(doc["content"]))
+    # Extract URLs from all content sections
+    urls = extract_urls_from_markdown(content["raw"])

     # Recursively process each URL
+    recursive_docs = []
     for sub_url in urls:
         if sub_url not in visited_urls:
-            sub_content = recursive_read_html(sub_url, depth - 1, raw, visited_urls)
-
-            if raw:
-                if sub_content:
-                    content += f"\n\n--- Begin content from {sub_url} ---\n\n"
-                    content += sub_content
-                    content += f"\n\n--- End content from {sub_url} ---\n\n"
-            else:
-                if isinstance(sub_content, list):
-                    content.extend(sub_content)
+            sub_content = recursive_read_html(sub_url, depth - 1, visited_urls)
+            recursive_docs.append(sub_content)

+    content["recursive_docs"] = recursive_docs
     return content
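With the `raw` flag gone, recursive crawling no longer splices sub-page text into one string or list; each child URL yields a complete result dictionary under `recursive_docs`. A sketch of walking that tree, reusing the URL from the README example below and assuming network access:

```python
from lexoid.api import parse

result = parse("https://www.justice.gov/eoir/immigration-law-advisor", depth=2)

def walk(doc, level=0):
    # Each node is a full parse result; children sit in "recursive_docs".
    print("  " * level + (doc["title"] or doc["url"]))
    for child in doc["recursive_docs"]:
        walk(child, level + 1)

walk(result)
```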
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.8.post1
+Version: 0.1.10
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -28,6 +28,7 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: together (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown

 # Lexoid
@@ -93,10 +94,10 @@ Here's a quick example to parse documents using Lexoid:
 from lexoid.api import parse
 from lexoid.api import ParserType

-parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE", raw=True)
+parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE")["raw"]
 # or
 pdf_path = "path/to/immigration-law-advisor.pdf"
-parsed_md = parse(pdf_path, parser_type="LLM_PARSE", raw=True)
+parsed_md = parse(pdf_path, parser_type="LLM_PARSE")["raw"]

 print(parsed_md)
 ```
@@ -104,7 +105,6 @@ print(parsed_md)
 ### Parameters
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
-- raw (bool, optional): Return raw text or structured data. Defaults to False.
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
 - **kwargs: Additional arguments for the parser.
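With `raw` removed from the parameter list, the markdown string and the structured segments come from the same call. A short sketch using the hypothetical path from the example above:

```python
from lexoid.api import parse

result = parse("path/to/immigration-law-advisor.pdf", parser_type="LLM_PARSE")

markdown = result["raw"]          # what raw=True used to return
for segment in result["segments"]:
    page = segment["metadata"]["page"]
    print(f"--- page {page} ---")
    print(segment["content"][:80])
```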
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=45nkTuQcxdppeUiRsiyioJtvlVeWeoq_WgKtGCthIBY,9193
+lexoid/core/parse_type/llm_parser.py,sha256=tH19B0w78OowkDdqJg3rom0kQmyuTaTfDP98Qnwufo0,10625
+lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=HT37qmdhPpUNN6O571G7ItE5K2Mv8SreBHmxrhdiXA8,18951
+lexoid-0.1.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.10.dist-info/METADATA,sha256=4uhJ_IaHEKPl9lxKg8RRrBQ5dn7oB23XCnJNG5sNpH4,4576
+lexoid-0.1.10.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.10.dist-info/RECORD,,
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
-lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=peWuMVTk90-j0aSDaRnwigpoAz_Q5y8vSosCDc6Zl3g,18642
-lexoid-0.1.8.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.8.post1.dist-info/METADATA,sha256=mz8A_92-GrLfOmT8UYcIxWIEkcskad_9vSnNnlbE4dI,4625
-lexoid-0.1.8.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.8.post1.dist-info/RECORD,,