content-core 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of content-core has been flagged as potentially problematic.

content_core/__init__.py CHANGED
@@ -214,5 +214,3 @@ def csum():
 
 if __name__ == "__main__":
     ccore()
-if __name__ == "__main__":
-    ccore()
content_core/common/state.py CHANGED
@@ -27,6 +27,14 @@ class ProcessSourceState(BaseModel):
         default=None,
         description="Override Docling output format: 'markdown', 'html', or 'json'",
     )
+    audio_provider: Optional[str] = Field(
+        default=None,
+        description="Override speech-to-text provider (e.g., 'openai', 'google')",
+    )
+    audio_model: Optional[str] = Field(
+        default=None,
+        description="Override speech-to-text model name (e.g., 'whisper-1', 'chirp')",
+    )
 
 
 class ProcessSourceInput(BaseModel):
@@ -36,6 +44,8 @@ class ProcessSourceInput(BaseModel):
     document_engine: Optional[str] = None
     url_engine: Optional[str] = None
     output_format: Optional[str] = None
+    audio_provider: Optional[str] = None
+    audio_model: Optional[str] = None
 
 
 class ProcessSourceOutput(BaseModel):
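The two hunks above add matching `audio_provider`/`audio_model` fields to both the state and the input models, so a caller can override the speech-to-text backend per request. A minimal usage sketch, assuming the package is imported as `cc` as in the README example later in this diff (the file name is a placeholder):

```python
# Sketch: per-request speech-to-text override via the new fields.
# ProcessSourceInput, audio_provider, and audio_model come from the
# diff above; "interview.mp3" is an illustrative path.
import asyncio

import content_core as cc
from content_core.common import ProcessSourceInput

async def main() -> None:
    result = await cc.extract(ProcessSourceInput(
        file_path="interview.mp3",
        audio_provider="openai",   # Esperanto provider name
        audio_model="whisper-1",   # model for that provider
    ))
    print(result)

asyncio.run(main())
```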
content_core/content/identification/file_detector.py CHANGED
@@ -13,10 +13,17 @@ from content_core.logging import logger
 
 class FileDetector:
     """Pure Python file type detection using magic bytes and content analysis."""
-
-    # Configuration constants
+
+    # Configuration constants for binary/text detection
     SIGNATURE_READ_SIZE = 512  # Bytes to read for binary signature detection
     TEXT_READ_SIZE = 1024  # Bytes to read for text content analysis
+
+    # Configuration constants for CSV detection
+    CSV_MAX_FIELD_LENGTH = 100  # Maximum average field length for CSV (longer suggests prose)
+    CSV_MAX_VARIANCE = 500  # Maximum variance in field lengths (higher suggests natural text)
+    CSV_MIN_SCORE = 2  # Minimum score required to classify as CSV
+    CSV_MIN_FIELDS = 2  # Minimum number of fields required for CSV
+    CSV_MAX_HEADER_FIELD_LENGTH = 50  # Maximum length for individual header fields
 
     def __init__(self):
         """Initialize the FileDetector with signature mappings."""
@@ -364,18 +371,102 @@ class FileDetector:
 
 
     def _looks_like_csv(self, content: str) -> bool:
-        """Check if content looks like CSV format."""
-        lines = content.split('\n', 5)[:5]  # Check first 5 lines
-        if len(lines) < 2:
+        """
+        Check if content looks like CSV format with improved heuristics.
+
+        Uses a multi-stage approach with performance optimization:
+        1. Basic structural checks (cheap)
+        2. Field length analysis (cheap, early exit)
+        3. Pattern matching (moderate cost)
+        4. Variance analysis (expensive, only if needed)
+        """
+        lines = content.split('\n', 10)[:10]  # Check first 10 lines for better accuracy
+        non_empty_lines = [line for line in lines if line.strip()]
+
+        # Stage 1: Basic structural checks
+        if len(non_empty_lines) < 2:
             return False
-
+
         # Count commas in each line
-        comma_counts = [line.count(',') for line in lines if line.strip()]
-        if not comma_counts:
+        comma_counts = [line.count(',') for line in non_empty_lines]
+
+        # Must have at least one comma per line
+        if not all(count > 0 for count in comma_counts):
             return False
-
-        # CSV should have consistent comma counts
-        return len(set(comma_counts)) == 1 and comma_counts[0] > 0
+
+        # CSV should have consistent comma counts across lines
+        if len(set(comma_counts)) != 1:
+            return False
+
+        num_fields = comma_counts[0] + 1  # Number of fields = commas + 1
+
+        # Must have minimum number of fields to be CSV
+        if num_fields < self.CSV_MIN_FIELDS:
+            return False
+
+        # Stage 2: Field length analysis (PERFORMANCE OPTIMIZATION: early exit)
+        first_line = non_empty_lines[0]
+        fields = first_line.split(',')
+
+        # CSV fields should be relatively short (not long sentences)
+        # Average field length should be reasonable (not paragraphs)
+        # Early exit avoids expensive variance calculations for obvious prose
+        avg_field_length = sum(len(f.strip()) for f in fields) / len(fields)
+        if avg_field_length > self.CSV_MAX_FIELD_LENGTH:
+            return False  # Too long to be typical CSV fields - exit early
+
+        # Stage 3: Pattern matching
+        # Check for CSV-like patterns:
+        # 1. Fields that look like headers (short, alphanumeric)
+        # 2. Quoted fields (common in CSV)
+        # 3. Numeric fields
+        has_quoted_fields = any('"' in line or "'" in line for line in non_empty_lines[:3])
+
+        first_line_fields = [f.strip() for f in fields]
+        # Check if first line looks like a header (short, no sentence-ending punctuation)
+        looks_like_header = all(
+            len(f) < self.CSV_MAX_HEADER_FIELD_LENGTH and not f.endswith('.') and not f.endswith('!')
+            for f in first_line_fields
+        )
+
+        # Stage 4: Variance analysis (EXPENSIVE - only if we have enough data)
+        # Check if subsequent lines have similar field structure
+        # Real CSV tends to have consistent field lengths
+        if len(non_empty_lines) >= 3:
+            field_lengths_per_line = []
+            for line in non_empty_lines[:5]:
+                line_fields = line.split(',')
+                field_lengths = [len(f.strip()) for f in line_fields]
+                field_lengths_per_line.append(field_lengths)
+
+            # Calculate variance in field positions
+            # CSV data should have relatively consistent field lengths at each position
+            # Natural text with commas will have much more variance
+            position_variances = []
+            for i in range(num_fields):
+                lengths_at_position = [fl[i] if i < len(fl) else 0 for fl in field_lengths_per_line]
+                if lengths_at_position:
+                    avg = sum(lengths_at_position) / len(lengths_at_position)
+                    variance = sum((x - avg) ** 2 for x in lengths_at_position) / len(lengths_at_position)
+                    position_variances.append(variance)
+
+            # High variance suggests natural text, not structured CSV
+            if position_variances:
+                avg_variance = sum(position_variances) / len(position_variances)
+                if avg_variance > self.CSV_MAX_VARIANCE:
+                    return False  # Very high variance = likely prose
+
+        # Scoring: Require at least some CSV-like characteristics
+        csv_score = 0
+        if looks_like_header:
+            csv_score += 1
+        if has_quoted_fields:
+            csv_score += 1
+        if num_fields >= 3:  # Multiple fields is more CSV-like
+            csv_score += 1
+
+        # Need minimum score to confidently classify as CSV
+        return csv_score >= self.CSV_MIN_SCORE
 
 
     def _is_text_file(self, content: str) -> bool:
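To make the Stage 4 idea concrete: split each line on commas, measure field lengths per column, and compare variances. Tabular data keeps columns at similar widths; prose that merely contains commas does not. A standalone sketch of the statistic (a simplified re-implementation for illustration, not the library API):

```python
# Column-wise field-length variance, the statistic behind Stage 4.
# Simplified re-implementation for demonstration purposes only.

def column_variances(lines):
    rows = [[len(f.strip()) for f in line.split(',')] for line in lines]
    variances = []
    for i in range(len(rows[0])):
        # Pad ragged rows with 0, mirroring the diff's behavior
        col = [r[i] if i < len(r) else 0 for r in rows]
        avg = sum(col) / len(col)
        variances.append(sum((x - avg) ** 2 for x in col) / len(col))
    return variances

csv_like = ["id,name,score", "1,alice,90", "2,bob,85"]
prose = [
    "Well, as it turns out, the meeting ran long",
    "No, I had not expected that, to be honest",
    "Yes, and then, after a long pause, she agreed",
]
print(column_variances(csv_like))  # small, consistent column widths
print(column_variances(prose))     # noticeably larger spread
```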
content_core/processors/audio.py CHANGED
@@ -45,7 +45,7 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
         end_time = min((i + 1) * segment_length_s, audio.duration)
 
         # Extract segment
-        output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+        output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
         output_path = os.path.join(output_dir, output_filename)
 
         # Export segment
@@ -53,7 +53,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
 
         output_files.append(output_path)
 
-        logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
+        logger.debug(
+            f"Exported segment {i + 1}/{total_segments}: {output_filename}"
+        )
 
     return output_files
 
@@ -172,7 +174,7 @@ async def extract_audio_data(data: ProcessSourceState):
         end_time = min((i + 1) * segment_length_s, audio.duration)
 
         # Extract segment
-        output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+        output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
         output_path = os.path.join(output_dir, output_filename)
 
         extract_audio(input_audio_path, output_path, start_time, end_time)
@@ -188,12 +190,45 @@ async def extract_audio_data(data: ProcessSourceState):
 
     # Transcribe audio files in parallel with concurrency limit
     from content_core.models import ModelFactory
+    from esperanto import AIFactory
+
+    # Determine which model to use based on state parameters
+    if data.audio_provider and data.audio_model:
+        # Custom model provided - create new instance
+        try:
+            logger.info(
+                f"Using custom audio model: {data.audio_provider}/{data.audio_model}"
+            )
+            speech_to_text_model = AIFactory.create_speech_to_text(
+                data.audio_provider, data.audio_model
+            )
+        except Exception as e:
+            logger.error(
+                f"Failed to create custom audio model '{data.audio_provider}/{data.audio_model}': {e}. "
+                f"Check that the provider and model are supported by Esperanto. "
+                f"Falling back to default model."
+            )
+            speech_to_text_model = ModelFactory.get_model("speech_to_text")
+    elif data.audio_provider or data.audio_model:
+        # Only one parameter provided - log warning and use default
+        missing = "audio_model" if data.audio_provider else "audio_provider"
+        provided = "audio_provider" if data.audio_provider else "audio_model"
+        logger.warning(
+            f"{provided} provided without {missing}. "
+            f"Both audio_provider and audio_model must be specified together. "
+            f"Falling back to default model."
+        )
+        speech_to_text_model = ModelFactory.get_model("speech_to_text")
+    else:
+        # No custom parameters - use default (backward compatible)
+        speech_to_text_model = ModelFactory.get_model("speech_to_text")
 
-    speech_to_text_model = ModelFactory.get_model("speech_to_text")
     concurrency = get_audio_concurrency()
     semaphore = asyncio.Semaphore(concurrency)
 
-    logger.debug(f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}")
+    logger.debug(
+        f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
+    )
 
     # Create tasks for parallel transcription
     transcription_tasks = [
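The selection block above is deliberately forgiving: an invalid provider/model pair, or an override with only one of the two fields set, logs the problem and falls back to the configured default instead of failing the transcription. A condensed sketch of that decision table (`resolve_stt_model` is a hypothetical helper for illustration; the two factory calls are the ones used in the diff):

```python
# Condensed decision table for the override logic above.
# resolve_stt_model is illustrative only; AIFactory.create_speech_to_text
# and ModelFactory.get_model are the calls that appear in the diff.
from typing import Optional

from content_core.models import ModelFactory
from esperanto import AIFactory

def resolve_stt_model(provider: Optional[str], model: Optional[str]):
    if provider and model:
        try:
            return AIFactory.create_speech_to_text(provider, model)
        except Exception:
            pass  # unsupported pair: fall through to the default
    # provider-only, model-only, or neither: use the configured default
    return ModelFactory.get_model("speech_to_text")
```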
content_core/templated_message.py CHANGED
@@ -2,7 +2,6 @@ from typing import Dict, Optional, Union
 
 from ai_prompter import Prompter
 from esperanto import LanguageModel
-from esperanto.common_types import Message
 from pydantic import BaseModel, Field
 
 from content_core.models import ModelFactory
content_core-1.5.0.dist-info/METADATA → content_core-1.7.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 1.5.0
+Version: 1.7.0
 Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -263,6 +263,14 @@ cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...
 
 # Summarize content with optional context
 summary = await cc.summarize_content("long article text", context="explain to a child")
+
+# Extract audio with custom speech-to-text model
+from content_core.common import ProcessSourceInput
+result = await cc.extract(ProcessSourceInput(
+    file_path="interview.mp3",
+    audio_provider="openai",
+    audio_model="whisper-1"
+))
 ```
 
 ## Documentation
content_core-1.5.0.dist-info/RECORD → content_core-1.7.0.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
-content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
+content_core/__init__.py,sha256=c1qw2HyV8DqBnoB9fbXNIZtda60MrxGUvWG5EQFtr-8,6492
 content_core/cc_config.yaml,sha256=3Ot5u-YSBx2k3JXWnCP7s7OVBbGpGebBy_CWj3we-u4,1211
 content_core/config.py,sha256=Mao6AZZoiSiX7uZwOGgk759LlV0j6NdfYGgWgX6vhAs,7112
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
 content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
 content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
-content_core/templated_message.py,sha256=jsjGqD-zf__pV4P0eo9cffTK2C90-VggL64qNYejFo0,1615
+content_core/templated_message.py,sha256=F4ysbVUWG1V3-pT8NYbCzP5mJN_qRYtiWPa9gxjB9v0,1572
 content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
 content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
-content_core/common/state.py,sha256=K5jsDg4l2GSaoGyFYzdd1GW14vLaAxdxes8vUrPNVkE,1622
+content_core/common/state.py,sha256=Xxxtdi650x4zkNX3yXA9Jx79GAzud7Vu-I7eNEjHlhI,2010
 content_core/common/types.py,sha256=DOQFW5ySHELc_mZU6G_7PUy1kmnP4aU4IpMyyXDQcBE,177
 content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
 content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeFiQM,230
@@ -17,14 +17,14 @@ content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnR
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
 content_core/content/extraction/graph.py,sha256=AFi9B_hTuxqdgvogCOk4Xdqoboug7_KXtV0ZHlb8igM,8139
 content_core/content/identification/__init__.py,sha256=DDoCi1r-6Z_pGPPi3X1ZwyRrcRtg-rAiCTK50hnO5Y0,235
-content_core/content/identification/file_detector.py,sha256=JTfGK28BQg_SGYqLzGVT4OGBfWx8HtEPA-3kfW5o3oE,17153
+content_core/content/identification/file_detector.py,sha256=GBP1cCNc1qnt5_HkE5ALQDja2f8WgqCC5h1r8xF0P7A,21480
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
 content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
 content_core/notebooks/run.ipynb,sha256=8gbFln9WLrli_qWJB8SKQKcSNbAv25DvN5Cu4EAAeBQ,370952
 content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
-content_core/processors/audio.py,sha256=fdR_KcLRG3jSwY3t_eVDoMgUHQQyXmAAlmfETMtomq0,8396
+content_core/processors/audio.py,sha256=h4aPff8WjDklE2iCviuAEEAYJTTxmWh9nOgMYJHWzmM,10202
 content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
@@ -36,8 +36,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-1.5.0.dist-info/METADATA,sha256=D3Cuy_zwW7u6jeuDVxYCwSEzJt8yrIjEFi9bJhJPqLQ,21963
-content_core-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-1.5.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
-content_core-1.5.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-1.5.0.dist-info/RECORD,,
+content_core-1.7.0.dist-info/METADATA,sha256=l3oDAdfN_gMFfOgHz3fELrjSxUXq8AKRKbC5uVF6mzM,22201
+content_core-1.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-1.7.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
+content_core-1.7.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-1.7.0.dist-info/RECORD,,