content-core 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

content_core/__init__.py CHANGED
@@ -214,5 +214,3 @@ def csum():
214
214
 
215
215
  if __name__ == "__main__":
216
216
  ccore()
217
- if __name__ == "__main__":
218
- ccore()
@@ -13,10 +13,17 @@ from content_core.logging import logger
13
13
 
14
14
  class FileDetector:
15
15
  """Pure Python file type detection using magic bytes and content analysis."""
16
-
17
- # Configuration constants
16
+
17
+ # Configuration constants for binary/text detection
18
18
  SIGNATURE_READ_SIZE = 512 # Bytes to read for binary signature detection
19
19
  TEXT_READ_SIZE = 1024 # Bytes to read for text content analysis
20
+
21
+ # Configuration constants for CSV detection
22
+ CSV_MAX_FIELD_LENGTH = 100 # Maximum average field length for CSV (longer suggests prose)
23
+ CSV_MAX_VARIANCE = 500 # Maximum variance in field lengths (higher suggests natural text)
24
+ CSV_MIN_SCORE = 2 # Minimum score required to classify as CSV
25
+ CSV_MIN_FIELDS = 2 # Minimum number of fields required for CSV
26
+ CSV_MAX_HEADER_FIELD_LENGTH = 50 # Maximum length for individual header fields
20
27
 
21
28
  def __init__(self):
22
29
  """Initialize the FileDetector with signature mappings."""
@@ -364,18 +371,102 @@ class FileDetector:
364
371
 
365
372
 
366
373
  def _looks_like_csv(self, content: str) -> bool:
367
- """Check if content looks like CSV format."""
368
- lines = content.split('\n', 5)[:5] # Check first 5 lines
369
- if len(lines) < 2:
374
+ """
375
+ Check if content looks like CSV format with improved heuristics.
376
+
377
+ Uses a multi-stage approach with performance optimization:
378
+ 1. Basic structural checks (cheap)
379
+ 2. Field length analysis (cheap, early exit)
380
+ 3. Pattern matching (moderate cost)
381
+ 4. Variance analysis (expensive, only if needed)
382
+ """
383
+ lines = content.split('\n', 10)[:10] # Check first 10 lines for better accuracy
384
+ non_empty_lines = [line for line in lines if line.strip()]
385
+
386
+ # Stage 1: Basic structural checks
387
+ if len(non_empty_lines) < 2:
370
388
  return False
371
-
389
+
372
390
  # Count commas in each line
373
- comma_counts = [line.count(',') for line in lines if line.strip()]
374
- if not comma_counts:
391
+ comma_counts = [line.count(',') for line in non_empty_lines]
392
+
393
+ # Must have at least one comma per line
394
+ if not all(count > 0 for count in comma_counts):
375
395
  return False
376
-
377
- # CSV should have consistent comma counts
378
- return len(set(comma_counts)) == 1 and comma_counts[0] > 0
396
+
397
+ # CSV should have consistent comma counts across lines
398
+ if len(set(comma_counts)) != 1:
399
+ return False
400
+
401
+ num_fields = comma_counts[0] + 1 # Number of fields = commas + 1
402
+
403
+ # Must have minimum number of fields to be CSV
404
+ if num_fields < self.CSV_MIN_FIELDS:
405
+ return False
406
+
407
+ # Stage 2: Field length analysis (PERFORMANCE OPTIMIZATION: early exit)
408
+ first_line = non_empty_lines[0]
409
+ fields = first_line.split(',')
410
+
411
+ # CSV fields should be relatively short (not long sentences)
412
+ # Average field length should be reasonable (not paragraphs)
413
+ # Early exit avoids expensive variance calculations for obvious prose
414
+ avg_field_length = sum(len(f.strip()) for f in fields) / len(fields)
415
+ if avg_field_length > self.CSV_MAX_FIELD_LENGTH:
416
+ return False # Too long to be typical CSV fields - exit early
417
+
418
+ # Stage 3: Pattern matching
419
+ # Check for CSV-like patterns:
420
+ # 1. Fields that look like headers (short, alphanumeric)
421
+ # 2. Quoted fields (common in CSV)
422
+ # 3. Numeric fields
423
+ has_quoted_fields = any('"' in line or "'" in line for line in non_empty_lines[:3])
424
+
425
+ first_line_fields = [f.strip() for f in fields]
426
+ # Check if first line looks like a header (short, no sentence-ending punctuation)
427
+ looks_like_header = all(
428
+ len(f) < self.CSV_MAX_HEADER_FIELD_LENGTH and not f.endswith('.') and not f.endswith('!')
429
+ for f in first_line_fields
430
+ )
431
+
432
+ # Stage 4: Variance analysis (EXPENSIVE - only if we have enough data)
433
+ # Check if subsequent lines have similar field structure
434
+ # Real CSV tends to have consistent field lengths
435
+ if len(non_empty_lines) >= 3:
436
+ field_lengths_per_line = []
437
+ for line in non_empty_lines[:5]:
438
+ line_fields = line.split(',')
439
+ field_lengths = [len(f.strip()) for f in line_fields]
440
+ field_lengths_per_line.append(field_lengths)
441
+
442
+ # Calculate variance in field positions
443
+ # CSV data should have relatively consistent field lengths at each position
444
+ # Natural text with commas will have much more variance
445
+ position_variances = []
446
+ for i in range(num_fields):
447
+ lengths_at_position = [fl[i] if i < len(fl) else 0 for fl in field_lengths_per_line]
448
+ if lengths_at_position:
449
+ avg = sum(lengths_at_position) / len(lengths_at_position)
450
+ variance = sum((x - avg) ** 2 for x in lengths_at_position) / len(lengths_at_position)
451
+ position_variances.append(variance)
452
+
453
+ # High variance suggests natural text, not structured CSV
454
+ if position_variances:
455
+ avg_variance = sum(position_variances) / len(position_variances)
456
+ if avg_variance > self.CSV_MAX_VARIANCE:
457
+ return False # Very high variance = likely prose
458
+
459
+ # Scoring: Require at least some CSV-like characteristics
460
+ csv_score = 0
461
+ if looks_like_header:
462
+ csv_score += 1
463
+ if has_quoted_fields:
464
+ csv_score += 1
465
+ if num_fields >= 3: # Multiple fields is more CSV-like
466
+ csv_score += 1
467
+
468
+ # Need minimum score to confidently classify as CSV
469
+ return csv_score >= self.CSV_MIN_SCORE
379
470
 
380
471
 
381
472
  def _is_text_file(self, content: str) -> bool:
@@ -45,7 +45,7 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
45
45
  end_time = min((i + 1) * segment_length_s, audio.duration)
46
46
 
47
47
  # Extract segment
48
- output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
48
+ output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
49
49
  output_path = os.path.join(output_dir, output_filename)
50
50
 
51
51
  # Export segment
@@ -53,7 +53,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
53
53
 
54
54
  output_files.append(output_path)
55
55
 
56
- logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
56
+ logger.debug(
57
+ f"Exported segment {i + 1}/{total_segments}: {output_filename}"
58
+ )
57
59
 
58
60
  return output_files
59
61
 
@@ -172,7 +174,7 @@ async def extract_audio_data(data: ProcessSourceState):
172
174
  end_time = min((i + 1) * segment_length_s, audio.duration)
173
175
 
174
176
  # Extract segment
175
- output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
177
+ output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
176
178
  output_path = os.path.join(output_dir, output_filename)
177
179
 
178
180
  extract_audio(input_audio_path, output_path, start_time, end_time)
@@ -193,7 +195,9 @@ async def extract_audio_data(data: ProcessSourceState):
193
195
  concurrency = get_audio_concurrency()
194
196
  semaphore = asyncio.Semaphore(concurrency)
195
197
 
196
- logger.debug(f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}")
198
+ logger.debug(
199
+ f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
200
+ )
197
201
 
198
202
  # Create tasks for parallel transcription
199
203
  transcription_tasks = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.5.0
3
+ Version: 1.6.0
4
4
  Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -1,4 +1,4 @@
1
- content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
1
+ content_core/__init__.py,sha256=c1qw2HyV8DqBnoB9fbXNIZtda60MrxGUvWG5EQFtr-8,6492
2
2
  content_core/cc_config.yaml,sha256=3Ot5u-YSBx2k3JXWnCP7s7OVBbGpGebBy_CWj3we-u4,1211
3
3
  content_core/config.py,sha256=Mao6AZZoiSiX7uZwOGgk759LlV0j6NdfYGgWgX6vhAs,7112
4
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
@@ -17,14 +17,14 @@ content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnR
17
17
  content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
18
18
  content_core/content/extraction/graph.py,sha256=AFi9B_hTuxqdgvogCOk4Xdqoboug7_KXtV0ZHlb8igM,8139
19
19
  content_core/content/identification/__init__.py,sha256=DDoCi1r-6Z_pGPPi3X1ZwyRrcRtg-rAiCTK50hnO5Y0,235
20
- content_core/content/identification/file_detector.py,sha256=JTfGK28BQg_SGYqLzGVT4OGBfWx8HtEPA-3kfW5o3oE,17153
20
+ content_core/content/identification/file_detector.py,sha256=GBP1cCNc1qnt5_HkE5ALQDja2f8WgqCC5h1r8xF0P7A,21480
21
21
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
22
22
  content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
23
23
  content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
24
24
  content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
25
25
  content_core/notebooks/run.ipynb,sha256=8gbFln9WLrli_qWJB8SKQKcSNbAv25DvN5Cu4EAAeBQ,370952
26
26
  content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
27
- content_core/processors/audio.py,sha256=fdR_KcLRG3jSwY3t_eVDoMgUHQQyXmAAlmfETMtomq0,8396
27
+ content_core/processors/audio.py,sha256=CYwoTDPsVUDALHuz_EHcnjVfsKF8XjQmvmX8c-OmMNU,8462
28
28
  content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
29
29
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
30
30
  content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
@@ -36,8 +36,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
36
36
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
37
37
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
38
38
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
39
- content_core-1.5.0.dist-info/METADATA,sha256=D3Cuy_zwW7u6jeuDVxYCwSEzJt8yrIjEFi9bJhJPqLQ,21963
40
- content_core-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
- content_core-1.5.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
42
- content_core-1.5.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
43
- content_core-1.5.0.dist-info/RECORD,,
39
+ content_core-1.6.0.dist-info/METADATA,sha256=bBxEINm9h2ppJIia11flDRDH7UshzamVrHKHGxHrmjs,21963
40
+ content_core-1.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
+ content_core-1.6.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
42
+ content_core-1.6.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
43
+ content_core-1.6.0.dist-info/RECORD,,