content-core 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
Potentially problematic release: this version of content-core might be problematic.
- content_core/__init__.py +0 -2
- content_core/content/identification/file_detector.py +102 -11
- content_core/processors/audio.py +8 -4
- {content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/METADATA +1 -1
- {content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/RECORD +8 -8
- {content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/WHEEL +0 -0
- {content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/entry_points.txt +0 -0
- {content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/licenses/LICENSE +0 -0
content_core/content/identification/file_detector.py
CHANGED
@@ -13,10 +13,17 @@ from content_core.logging import logger
 
 class FileDetector:
     """Pure Python file type detection using magic bytes and content analysis."""
-
-    # Configuration constants
+
+    # Configuration constants for binary/text detection
     SIGNATURE_READ_SIZE = 512  # Bytes to read for binary signature detection
     TEXT_READ_SIZE = 1024  # Bytes to read for text content analysis
+
+    # Configuration constants for CSV detection
+    CSV_MAX_FIELD_LENGTH = 100  # Maximum average field length for CSV (longer suggests prose)
+    CSV_MAX_VARIANCE = 500  # Maximum variance in field lengths (higher suggests natural text)
+    CSV_MIN_SCORE = 2  # Minimum score required to classify as CSV
+    CSV_MIN_FIELDS = 2  # Minimum number of fields required for CSV
+    CSV_MAX_HEADER_FIELD_LENGTH = 50  # Maximum length for individual header fields
 
     def __init__(self):
         """Initialize the FileDetector with signature mappings."""
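The new thresholds are class attributes read through self in the detection code, so they can be tuned without editing the logic. A minimal sketch of overriding one (the subclass name and value are hypothetical; FileDetector and the constant come from this diff):

from content_core.content.identification.file_detector import FileDetector

class LenientCsvDetector(FileDetector):
    # Hypothetical override: tolerate longer average field lengths before ruling out CSV
    CSV_MAX_FIELD_LENGTH = 200

detector = LenientCsvDetector()  # behaves like FileDetector, with the relaxed threshold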
@@ -364,18 +371,102 @@ class FileDetector:
 
 
     def _looks_like_csv(self, content: str) -> bool:
-        """
-
-
+        """
+        Check if content looks like CSV format with improved heuristics.
+
+        Uses a multi-stage approach with performance optimization:
+        1. Basic structural checks (cheap)
+        2. Field length analysis (cheap, early exit)
+        3. Pattern matching (moderate cost)
+        4. Variance analysis (expensive, only if needed)
+        """
+        lines = content.split('\n', 10)[:10]  # Check first 10 lines for better accuracy
+        non_empty_lines = [line for line in lines if line.strip()]
+
+        # Stage 1: Basic structural checks
+        if len(non_empty_lines) < 2:
             return False
-
+
         # Count commas in each line
-        comma_counts = [line.count(',') for line in
-
+        comma_counts = [line.count(',') for line in non_empty_lines]
+
+        # Must have at least one comma per line
+        if not all(count > 0 for count in comma_counts):
             return False
-
-        # CSV should have consistent comma counts
-
+
+        # CSV should have consistent comma counts across lines
+        if len(set(comma_counts)) != 1:
+            return False
+
+        num_fields = comma_counts[0] + 1  # Number of fields = commas + 1
+
+        # Must have minimum number of fields to be CSV
+        if num_fields < self.CSV_MIN_FIELDS:
+            return False
+
+        # Stage 2: Field length analysis (PERFORMANCE OPTIMIZATION: early exit)
+        first_line = non_empty_lines[0]
+        fields = first_line.split(',')
+
+        # CSV fields should be relatively short (not long sentences)
+        # Average field length should be reasonable (not paragraphs)
+        # Early exit avoids expensive variance calculations for obvious prose
+        avg_field_length = sum(len(f.strip()) for f in fields) / len(fields)
+        if avg_field_length > self.CSV_MAX_FIELD_LENGTH:
+            return False  # Too long to be typical CSV fields - exit early
+
+        # Stage 3: Pattern matching
+        # Check for CSV-like patterns:
+        # 1. Fields that look like headers (short, alphanumeric)
+        # 2. Quoted fields (common in CSV)
+        # 3. Numeric fields
+        has_quoted_fields = any('"' in line or "'" in line for line in non_empty_lines[:3])
+
+        first_line_fields = [f.strip() for f in fields]
+        # Check if first line looks like a header (short, no sentence-ending punctuation)
+        looks_like_header = all(
+            len(f) < self.CSV_MAX_HEADER_FIELD_LENGTH and not f.endswith('.') and not f.endswith('!')
+            for f in first_line_fields
+        )
+
+        # Stage 4: Variance analysis (EXPENSIVE - only if we have enough data)
+        # Check if subsequent lines have similar field structure
+        # Real CSV tends to have consistent field lengths
+        if len(non_empty_lines) >= 3:
+            field_lengths_per_line = []
+            for line in non_empty_lines[:5]:
+                line_fields = line.split(',')
+                field_lengths = [len(f.strip()) for f in line_fields]
+                field_lengths_per_line.append(field_lengths)
+
+            # Calculate variance in field positions
+            # CSV data should have relatively consistent field lengths at each position
+            # Natural text with commas will have much more variance
+            position_variances = []
+            for i in range(num_fields):
+                lengths_at_position = [fl[i] if i < len(fl) else 0 for fl in field_lengths_per_line]
+                if lengths_at_position:
+                    avg = sum(lengths_at_position) / len(lengths_at_position)
+                    variance = sum((x - avg) ** 2 for x in lengths_at_position) / len(lengths_at_position)
+                    position_variances.append(variance)
+
+            # High variance suggests natural text, not structured CSV
+            if position_variances:
+                avg_variance = sum(position_variances) / len(position_variances)
+                if avg_variance > self.CSV_MAX_VARIANCE:
+                    return False  # Very high variance = likely prose
+
+        # Scoring: Require at least some CSV-like characteristics
+        csv_score = 0
+        if looks_like_header:
+            csv_score += 1
+        if has_quoted_fields:
+            csv_score += 1
+        if num_fields >= 3:  # Multiple fields is more CSV-like
+            csv_score += 1
+
+        # Need minimum score to confidently classify as CSV
+        return csv_score >= self.CSV_MIN_SCORE
 
 
     def _is_text_file(self, content: str) -> bool:
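The net effect is a conservative classifier: even text whose comma counts are perfectly consistent is rejected unless it also scores at least CSV_MIN_SCORE points from header shape, quoting, or field count. A small sketch of the expected behavior (calling the private helper directly, purely for illustration; both samples are hypothetical):

from content_core.content.identification.file_detector import FileDetector

detector = FileDetector()

csv_sample = "id,name,age\n1,Alice,30\n2,Bob,25"
prose_sample = ("This is a sentence, with a comma in it.\n"
                "Another line, also with one comma here.")

# Header-like first row (+1) and three fields (+1) reach the minimum score of 2
print(detector._looks_like_csv(csv_sample))    # expected: True
# Consistent comma counts but no header shape, quotes, or third field: score 0
print(detector._looks_like_csv(prose_sample))  # expected: False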
content_core/processors/audio.py
CHANGED
@@ -45,7 +45,7 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
         end_time = min((i + 1) * segment_length_s, audio.duration)
 
         # Extract segment
-        output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+        output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
         output_path = os.path.join(output_dir, output_filename)
 
         # Export segment
@@ -53,7 +53,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
 
         output_files.append(output_path)
 
-        logger.debug(
+        logger.debug(
+            f"Exported segment {i + 1}/{total_segments}: {output_filename}"
+        )
 
     return output_files
 
@@ -172,7 +174,7 @@ async def extract_audio_data(data: ProcessSourceState):
         end_time = min((i + 1) * segment_length_s, audio.duration)
 
         # Extract segment
-        output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+        output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
         output_path = os.path.join(output_dir, output_filename)
 
         extract_audio(input_audio_path, output_path, start_time, end_time)
@@ -193,7 +195,9 @@ async def extract_audio_data(data: ProcessSourceState):
     concurrency = get_audio_concurrency()
     semaphore = asyncio.Semaphore(concurrency)
 
-    logger.debug(
+    logger.debug(
+        f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
+    )
 
     # Create tasks for parallel transcription
     transcription_tasks = [
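Both output_filename edits are cosmetic (PEP 8 spacing around + inside the f-string); the zero-padded segment naming itself is unchanged, as this short sketch with a hypothetical prefix shows:

output_prefix = "podcast"  # hypothetical value for illustration
for i in range(2):
    print(f"{output_prefix}_{str(i + 1).zfill(3)}.mp3")
# podcast_001.mp3
# podcast_002.mp3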
{content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-content_core/__init__.py,sha256=
+content_core/__init__.py,sha256=c1qw2HyV8DqBnoB9fbXNIZtda60MrxGUvWG5EQFtr-8,6492
 content_core/cc_config.yaml,sha256=3Ot5u-YSBx2k3JXWnCP7s7OVBbGpGebBy_CWj3we-u4,1211
 content_core/config.py,sha256=Mao6AZZoiSiX7uZwOGgk759LlV0j6NdfYGgWgX6vhAs,7112
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
@@ -17,14 +17,14 @@ content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnR
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
 content_core/content/extraction/graph.py,sha256=AFi9B_hTuxqdgvogCOk4Xdqoboug7_KXtV0ZHlb8igM,8139
 content_core/content/identification/__init__.py,sha256=DDoCi1r-6Z_pGPPi3X1ZwyRrcRtg-rAiCTK50hnO5Y0,235
-content_core/content/identification/file_detector.py,sha256=
+content_core/content/identification/file_detector.py,sha256=GBP1cCNc1qnt5_HkE5ALQDja2f8WgqCC5h1r8xF0P7A,21480
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
 content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
 content_core/notebooks/run.ipynb,sha256=8gbFln9WLrli_qWJB8SKQKcSNbAv25DvN5Cu4EAAeBQ,370952
 content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
-content_core/processors/audio.py,sha256=
+content_core/processors/audio.py,sha256=CYwoTDPsVUDALHuz_EHcnjVfsKF8XjQmvmX8c-OmMNU,8462
 content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
@@ -36,8 +36,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-1.
-content_core-1.
-content_core-1.
-content_core-1.
-content_core-1.
+content_core-1.6.0.dist-info/METADATA,sha256=bBxEINm9h2ppJIia11flDRDH7UshzamVrHKHGxHrmjs,21963
+content_core-1.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-1.6.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
+content_core-1.6.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-1.6.0.dist-info/RECORD,,
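Each RECORD row has the form path,hash,size, where the hash is the urlsafe-base64 SHA-256 digest of the file with trailing = padding stripped (per the wheel RECORD spec). A small sketch for checking an entry by hand (the helper name is hypothetical):

import base64
import hashlib

def record_hash(path: str) -> str:
    # Compute the sha256=... value as it appears in a wheel RECORD file
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_hash("content_core/config.py") should reproduce the value listed above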
{content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/WHEEL
File without changes
{content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/entry_points.txt
File without changes
{content_core-1.5.0.dist-info → content_core-1.6.0.dist-info}/licenses/LICENSE
File without changes