content-core 1.0.0__tar.gz → 1.0.2__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (62)
  1. {content_core-1.0.0 → content_core-1.0.2}/PKG-INFO +3 -2
  2. {content_core-1.0.0 → content_core-1.0.2}/pyproject.toml +4 -2
  3. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/processors/youtube.py +80 -62
  4. {content_core-1.0.0 → content_core-1.0.2}/uv.lock +14 -8
  5. {content_core-1.0.0 → content_core-1.0.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  6. {content_core-1.0.0 → content_core-1.0.2}/.github/workflows/publish.yml +0 -0
  7. {content_core-1.0.0 → content_core-1.0.2}/.gitignore +0 -0
  8. {content_core-1.0.0 → content_core-1.0.2}/.python-version +0 -0
  9. {content_core-1.0.0 → content_core-1.0.2}/CONTRIBUTING.md +0 -0
  10. {content_core-1.0.0 → content_core-1.0.2}/LICENSE +0 -0
  11. {content_core-1.0.0 → content_core-1.0.2}/Makefile +0 -0
  12. {content_core-1.0.0 → content_core-1.0.2}/README.md +0 -0
  13. {content_core-1.0.0 → content_core-1.0.2}/docs/processors.md +0 -0
  14. {content_core-1.0.0 → content_core-1.0.2}/docs/usage.md +0 -0
  15. {content_core-1.0.0 → content_core-1.0.2}/prompts/content/cleanup.jinja +0 -0
  16. {content_core-1.0.0 → content_core-1.0.2}/prompts/content/summarize.jinja +0 -0
  17. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/__init__.py +0 -0
  18. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/cc_config.yaml +0 -0
  19. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/common/__init__.py +0 -0
  20. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/common/exceptions.py +0 -0
  21. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/common/state.py +0 -0
  22. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/common/types.py +0 -0
  23. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/common/utils.py +0 -0
  24. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/config.py +0 -0
  25. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/content/__init__.py +0 -0
  26. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/content/cleanup/__init__.py +0 -0
  27. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/content/cleanup/core.py +0 -0
  28. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/content/extraction/__init__.py +0 -0
  29. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/content/extraction/graph.py +0 -0
  30. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/content/identification/__init__.py +0 -0
  31. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/content/summary/__init__.py +0 -0
  32. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/content/summary/core.py +0 -0
  33. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/logging.py +0 -0
  34. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/models.py +0 -0
  35. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/models_config.yaml +0 -0
  36. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/notebooks/run.ipynb +0 -0
  37. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/processors/audio.py +0 -0
  38. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/processors/docling.py +0 -0
  39. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/processors/office.py +0 -0
  40. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/processors/pdf.py +0 -0
  41. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/processors/text.py +0 -0
  42. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/processors/url.py +0 -0
  43. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/processors/video.py +0 -0
  44. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/py.typed +0 -0
  45. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/templated_message.py +0 -0
  46. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/tools/__init__.py +0 -0
  47. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/tools/cleanup.py +0 -0
  48. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/tools/extract.py +0 -0
  49. {content_core-1.0.0 → content_core-1.0.2}/src/content_core/tools/summarize.py +0 -0
  50. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.docx +0 -0
  51. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.epub +0 -0
  52. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.md +0 -0
  53. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.mp3 +0 -0
  54. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.mp4 +0 -0
  55. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.pdf +0 -0
  56. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.pptx +0 -0
  57. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.txt +0 -0
  58. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file.xlsx +0 -0
  59. {content_core-1.0.0 → content_core-1.0.2}/tests/input_content/file_audio.mp3 +0 -0
  60. {content_core-1.0.0 → content_core-1.0.2}/tests/integration/test_cli.py +0 -0
  61. {content_core-1.0.0 → content_core-1.0.2}/tests/integration/test_extraction.py +0 -0
  62. {content_core-1.0.0 → content_core-1.0.2}/tests/unit/test_docling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -11,7 +11,7 @@ Requires-Dist: asciidoc>=10.2.1
11
11
  Requires-Dist: bs4>=0.0.2
12
12
  Requires-Dist: dicttoxml>=1.7.16
13
13
  Requires-Dist: docling>=2.34.0
14
- Requires-Dist: esperanto[openai]>=1.2.0
14
+ Requires-Dist: esperanto>=1.2.0
15
15
  Requires-Dist: firecrawl-py>=2.7.0
16
16
  Requires-Dist: jinja2>=3.1.6
17
17
  Requires-Dist: langdetect>=1.0.9
@@ -24,6 +24,7 @@ Requires-Dist: pillow>=10.4.0
24
24
  Requires-Dist: pymupdf>=1.25.5
25
25
  Requires-Dist: python-docx>=1.1.2
26
26
  Requires-Dist: python-dotenv>=1.1.0
27
+ Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
27
28
  Requires-Dist: python-magic>=0.4.27
28
29
  Requires-Dist: python-pptx>=1.0.2
29
30
  Requires-Dist: readability-lxml>=0.8.4.1
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "1.0.0"
3
+ version = "1.0.2"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -11,7 +11,7 @@ requires-python = ">=3.10"
11
11
  dependencies = [
12
12
  "aiohttp>=3.11",
13
13
  "bs4>=0.0.2",
14
- "esperanto[openai]>=1.2.0",
14
+ "esperanto>=1.2.0",
15
15
  "jinja2>=3.1.6",
16
16
  "langdetect>=1.0.9",
17
17
  "loguru>=0.7.3",
@@ -33,6 +33,8 @@ dependencies = [
33
33
  "docling>=2.34.0",
34
34
  "pillow>=10.4.0",
35
35
  "asciidoc>=10.2.1",
36
+ "python-magic-bin==0.4.14; sys_platform == 'win32'",
37
+
36
38
  ]
37
39
 
38
40
  [project.scripts]
@@ -1,3 +1,4 @@
1
+ import asyncio
1
2
  import re
2
3
  import ssl
3
4
 
@@ -68,69 +69,86 @@ async def _extract_youtube_id(url):
68
69
 
69
70
 
70
71
  async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
71
- try:
72
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
73
-
74
- # First try: Manual transcripts in preferred languages
75
- manual_transcripts = []
76
- try:
77
- for transcript in transcript_list:
78
- if not transcript.is_generated and not transcript.is_translatable:
79
- manual_transcripts.append(transcript)
80
-
81
- if manual_transcripts:
82
- # Sort based on preferred language order
83
- for lang in preferred_langs:
84
- for transcript in manual_transcripts:
85
- if transcript.language_code == lang:
86
- return transcript.fetch()
87
- # If no preferred language found, return first manual transcript
88
- return manual_transcripts[0].fetch()
89
- except NoTranscriptFound:
90
- pass
91
-
92
- # Second try: Auto-generated transcripts in preferred languages
93
- generated_transcripts = []
94
- try:
95
- for transcript in transcript_list:
96
- if transcript.is_generated and not transcript.is_translatable:
97
- generated_transcripts.append(transcript)
98
-
99
- if generated_transcripts:
100
- # Sort based on preferred language order
101
- for lang in preferred_langs:
102
- for transcript in generated_transcripts:
103
- if transcript.language_code == lang:
104
- return transcript.fetch()
105
- # If no preferred language found, return first generated transcript
106
- return generated_transcripts[0].fetch()
107
- except NoTranscriptFound:
108
- pass
109
-
110
- # Last try: Translated transcripts in preferred languages
111
- translated_transcripts = []
72
+ max_attempts = 5
73
+ for attempt in range(max_attempts):
112
74
  try:
113
- for transcript in transcript_list:
114
- if transcript.is_translatable:
115
- translated_transcripts.append(transcript)
116
-
117
- if translated_transcripts:
118
- # Sort based on preferred language order
119
- for lang in preferred_langs:
120
- for transcript in translated_transcripts:
121
- if transcript.language_code == lang:
122
- return transcript.fetch()
123
- # If no preferred language found, return translation to first preferred language
124
- translation = translated_transcripts[0].translate(preferred_langs[0])
125
- return translation.fetch()
126
- except NoTranscriptFound:
127
- pass
128
-
129
- raise Exception("No suitable transcript found")
130
-
131
- except Exception as e:
132
- logger.error(f"Failed to get transcript for video {video_id}: {e}")
133
- return None
75
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
76
+
77
+ # First try: Manual transcripts in preferred languages
78
+ manual_transcripts = []
79
+ try:
80
+ for transcript in transcript_list:
81
+ if not transcript.is_generated and not transcript.is_translatable:
82
+ manual_transcripts.append(transcript)
83
+
84
+ if manual_transcripts:
85
+ # Sort based on preferred language order
86
+ for lang in preferred_langs:
87
+ for transcript in manual_transcripts:
88
+ if transcript.language_code == lang:
89
+ return transcript.fetch()
90
+ # If no preferred language found, return first manual transcript
91
+ return manual_transcripts[0].fetch()
92
+ except NoTranscriptFound:
93
+ pass
94
+
95
+ # Second try: Auto-generated transcripts in preferred languages
96
+ generated_transcripts = []
97
+ try:
98
+ for transcript in transcript_list:
99
+ if transcript.is_generated and not transcript.is_translatable:
100
+ generated_transcripts.append(transcript)
101
+
102
+ if generated_transcripts:
103
+ # Sort based on preferred language order
104
+ for lang in preferred_langs:
105
+ for transcript in generated_transcripts:
106
+ if transcript.language_code == lang:
107
+ return transcript.fetch()
108
+ # If no preferred language found, return first generated transcript
109
+ return generated_transcripts[0].fetch()
110
+ except NoTranscriptFound:
111
+ pass
112
+
113
+ # Last try: Translated transcripts in preferred languages
114
+ translated_transcripts = []
115
+ try:
116
+ for transcript in transcript_list:
117
+ if transcript.is_translatable:
118
+ translated_transcripts.append(transcript)
119
+
120
+ if translated_transcripts:
121
+ # Sort based on preferred language order
122
+ for lang in preferred_langs:
123
+ for transcript in translated_transcripts:
124
+ if transcript.language_code == lang:
125
+ return transcript.fetch()
126
+ # If no preferred language found, return translation to first preferred language
127
+ translation = translated_transcripts[0].translate(
128
+ preferred_langs[0]
129
+ )
130
+ return translation.fetch()
131
+ except NoTranscriptFound:
132
+ pass
133
+
134
+ raise Exception("No suitable transcript found")
135
+
136
+ except Exception as e:
137
+ if e.__class__.__name__ == "ParserError":
138
+ logger.warning(
139
+ f"ParserError on attempt {attempt+1}/5 for video {video_id}. Retrying..."
140
+ )
141
+ if attempt == max_attempts - 1:
142
+ logger.error(
143
+ f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
144
+ )
145
+ return None
146
+ await asyncio.sleep(2)
147
+ continue
148
+ else:
149
+ logger.error(f"Failed to get transcript for video {video_id}: {e}")
150
+ return None
151
+ return None
134
152
 
135
153
 
136
154
  async def extract_youtube_transcript(state: ProcessSourceState):
@@ -410,7 +410,7 @@ wheels = [
410
410
 
411
411
  [[package]]
412
412
  name = "content-core"
413
- version = "1.0"
413
+ version = "1.0.2"
414
414
  source = { editable = "." }
415
415
  dependencies = [
416
416
  { name = "ai-prompter" },
@@ -419,7 +419,7 @@ dependencies = [
419
419
  { name = "bs4" },
420
420
  { name = "dicttoxml" },
421
421
  { name = "docling" },
422
- { name = "esperanto", extra = ["openai"] },
422
+ { name = "esperanto" },
423
423
  { name = "firecrawl-py" },
424
424
  { name = "jinja2" },
425
425
  { name = "langdetect" },
@@ -433,6 +433,7 @@ dependencies = [
433
433
  { name = "python-docx" },
434
434
  { name = "python-dotenv" },
435
435
  { name = "python-magic" },
436
+ { name = "python-magic-bin", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux' and sys_platform == 'win32') or (platform_system != 'Darwin' and platform_system != 'Linux' and sys_platform == 'win32')" },
436
437
  { name = "python-pptx" },
437
438
  { name = "readability-lxml" },
438
439
  { name = "validators" },
@@ -457,7 +458,7 @@ requires-dist = [
457
458
  { name = "bs4", specifier = ">=0.0.2" },
458
459
  { name = "dicttoxml", specifier = ">=1.7.16" },
459
460
  { name = "docling", specifier = ">=2.34.0" },
460
- { name = "esperanto", extras = ["openai"], specifier = ">=1.2.0" },
461
+ { name = "esperanto", specifier = ">=1.2.0" },
461
462
  { name = "firecrawl-py", specifier = ">=2.7.0" },
462
463
  { name = "jinja2", specifier = ">=3.1.6" },
463
464
  { name = "langdetect", specifier = ">=1.0.9" },
@@ -471,6 +472,7 @@ requires-dist = [
471
472
  { name = "python-docx", specifier = ">=1.1.2" },
472
473
  { name = "python-dotenv", specifier = ">=1.1.0" },
473
474
  { name = "python-magic", specifier = ">=0.4.27" },
475
+ { name = "python-magic-bin", marker = "sys_platform == 'win32'", specifier = "==0.4.14" },
474
476
  { name = "python-pptx", specifier = ">=1.0.2" },
475
477
  { name = "readability-lxml", specifier = ">=0.8.4.1" },
476
478
  { name = "validators", specifier = ">=0.34.0" },
@@ -729,11 +731,6 @@ wheels = [
729
731
  { url = "https://files.pythonhosted.org/packages/f9/79/5d74f2b8f9d73da83bfe80a39ff11505a2a285c03a869750db98cd89ddfd/esperanto-1.2.1-py3-none-any.whl", hash = "sha256:2fa41e5e35c847b1fe58395906d8877035f7e55d6429870d897781f7c9f17c42", size = 57680 },
730
732
  ]
731
733
 
732
- [package.optional-dependencies]
733
- openai = [
734
- { name = "openai" },
735
- ]
736
-
737
734
  [[package]]
738
735
  name = "et-xmlfile"
739
736
  version = "2.0.0"
@@ -2919,6 +2916,15 @@ wheels = [
2919
2916
  { url = "https://files.pythonhosted.org/packages/6c/73/9f872cb81fc5c3bb48f7227872c28975f998f3e7c2b1c16e95e6432bbb90/python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3", size = 13840 },
2920
2917
  ]
2921
2918
 
2919
+ [[package]]
2920
+ name = "python-magic-bin"
2921
+ version = "0.4.14"
2922
+ source = { registry = "https://pypi.org/simple" }
2923
+ wheels = [
2924
+ { url = "https://files.pythonhosted.org/packages/5a/5d/10b9ac745d9fd2f7151a2ab901e6bb6983dbd70e87c71111f54859d1ca2e/python_magic_bin-0.4.14-py2.py3-none-win32.whl", hash = "sha256:34a788c03adde7608028203e2dbb208f1f62225ad91518787ae26d603ae68892", size = 397784 },
2925
+ { url = "https://files.pythonhosted.org/packages/07/c2/094e3d62b906d952537196603a23aec4bcd7c6126bf80eb14e6f9f4be3a2/python_magic_bin-0.4.14-py2.py3-none-win_amd64.whl", hash = "sha256:90be6206ad31071a36065a2fc169c5afb5e0355cbe6030e87641c6c62edc2b69", size = 409299 },
2926
+ ]
2927
+
2922
2928
  [[package]]
2923
2929
  name = "python-pptx"
2924
2930
  version = "1.0.2"
File without changes
File without changes
File without changes
File without changes
File without changes