content-core 0.3.1__tar.gz → 0.4.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (58)
  1. {content_core-0.3.1 → content_core-0.4.0}/.windsurfrules +2 -2
  2. {content_core-0.3.1 → content_core-0.4.0}/PKG-INFO +1 -1
  3. {content_core-0.3.1 → content_core-0.4.0}/pyproject.toml +1 -1
  4. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/content/extraction/graph.py +22 -1
  5. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/processors/url.py +14 -5
  6. {content_core-0.3.1 → content_core-0.4.0}/tests/integration/test_extraction.py +10 -0
  7. {content_core-0.3.1 → content_core-0.4.0}/uv.lock +2 -2
  8. {content_core-0.3.1 → content_core-0.4.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  9. {content_core-0.3.1 → content_core-0.4.0}/.github/workflows/publish.yml +0 -0
  10. {content_core-0.3.1 → content_core-0.4.0}/.gitignore +0 -0
  11. {content_core-0.3.1 → content_core-0.4.0}/.python-version +0 -0
  12. {content_core-0.3.1 → content_core-0.4.0}/CONTRIBUTING.md +0 -0
  13. {content_core-0.3.1 → content_core-0.4.0}/LICENSE +0 -0
  14. {content_core-0.3.1 → content_core-0.4.0}/Makefile +0 -0
  15. {content_core-0.3.1 → content_core-0.4.0}/README.md +0 -0
  16. {content_core-0.3.1 → content_core-0.4.0}/docs/processors.md +0 -0
  17. {content_core-0.3.1 → content_core-0.4.0}/docs/usage.md +0 -0
  18. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/__init__.py +0 -0
  19. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/common/__init__.py +0 -0
  20. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/common/exceptions.py +0 -0
  21. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/common/state.py +0 -0
  22. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/common/utils.py +0 -0
  23. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/config.py +0 -0
  24. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/content/__init__.py +0 -0
  25. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/content/cleanup/__init__.py +0 -0
  26. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/content/cleanup/core.py +0 -0
  27. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/content/extraction/__init__.py +0 -0
  28. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/content/summary/__init__.py +0 -0
  29. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/content/summary/core.py +0 -0
  30. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/logging.py +0 -0
  31. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/models.py +0 -0
  32. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/models_config.yaml +0 -0
  33. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/notebooks/run.ipynb +0 -0
  34. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/processors/audio.py +0 -0
  35. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/processors/office.py +0 -0
  36. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/processors/pdf.py +0 -0
  37. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/processors/text.py +0 -0
  38. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/processors/video.py +0 -0
  39. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/processors/youtube.py +0 -0
  40. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/prompter.py +0 -0
  41. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/prompts/content/cleanup.jinja +0 -0
  42. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/prompts/content/summarize.jinja +0 -0
  43. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/py.typed +0 -0
  44. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/templated_message.py +0 -0
  45. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/tools/__init__.py +0 -0
  46. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/tools/cleanup.py +0 -0
  47. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/tools/extract.py +0 -0
  48. {content_core-0.3.1 → content_core-0.4.0}/src/content_core/tools/summarize.py +0 -0
  49. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.docx +0 -0
  50. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.epub +0 -0
  51. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.md +0 -0
  52. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.mp3 +0 -0
  53. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.mp4 +0 -0
  54. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.pdf +0 -0
  55. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.pptx +0 -0
  56. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.txt +0 -0
  57. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file.xlsx +0 -0
  58. {content_core-0.3.1 → content_core-0.4.0}/tests/input_content/file_audio.mp3 +0 -0
@@ -4,10 +4,10 @@ All documentation (code or readmes) must be in english.
4
4
  Whenever I ask you to tag and release, make sure to run `make test` as part of the process.
5
5
 
6
6
  The full release process is:
7
- - Run `make test` to make sure everything is working
7
+ - Run `make test` to make sure everything is working (if we changed any code or import)
8
8
  - Update version on pyproject.toml
9
9
  - Run `uv sync` to update the lock file
10
10
  - Commit all that's needed
11
- - Merge to main
11
+ - Merge to main (if in a branch)
12
12
  - Tag the release
13
13
  - Push to GitHub
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.3.1
3
+ Version: 0.4.0
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "0.3.1"
3
+ version = "0.4.0"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -21,6 +21,10 @@ from content_core.processors.url import extract_url, url_provider
21
21
  from content_core.processors.video import extract_best_audio_from_video
22
22
  from content_core.processors.youtube import extract_youtube_transcript
23
23
 
24
+ import aiohttp
25
+ import tempfile
26
+ from urllib.parse import urlparse
27
+
24
28
 
25
29
  async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
26
30
  """
@@ -91,6 +95,21 @@ async def source_type_router(x: ProcessSourceState) -> Optional[str]:
91
95
  return x.source_type
92
96
 
93
97
 
98
+ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
99
+ url = state.url
100
+ assert url, "No URL provided"
101
+ async with aiohttp.ClientSession() as session:
102
+ async with session.get(url) as resp:
103
+ resp.raise_for_status()
104
+ mime = resp.headers.get("content-type", "").split(";", 1)[0]
105
+ suffix = os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
106
+ fd, tmp = tempfile.mkstemp(suffix=suffix)
107
+ os.close(fd)
108
+ with open(tmp, "wb") as f:
109
+ f.write(await resp.read())
110
+ return {"file_path": tmp, "identified_type": mime}
111
+
112
+
94
113
  # Create workflow
95
114
  workflow = StateGraph(
96
115
  ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
@@ -108,6 +127,7 @@ workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video
108
127
  workflow.add_node("extract_audio", extract_audio)
109
128
  workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
110
129
  workflow.add_node("delete_file", delete_file)
130
+ workflow.add_node("download_remote_file", download_remote_file)
111
131
 
112
132
  # Add edges
113
133
  workflow.add_edge(START, "source")
@@ -127,7 +147,7 @@ workflow.add_conditional_edges(
127
147
  workflow.add_conditional_edges(
128
148
  "url_provider",
129
149
  url_type_router,
130
- {"article": "extract_url", "youtube": "extract_youtube_transcript"},
150
+ {**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES}, "article": "extract_url", "youtube": "extract_youtube_transcript"},
131
151
  )
132
152
  workflow.add_edge("url_provider", END)
133
153
  workflow.add_edge("file_type", END)
@@ -140,6 +160,7 @@ workflow.add_edge("extract_office_content", "delete_file")
140
160
  workflow.add_edge("extract_best_audio_from_video", "extract_audio")
141
161
  workflow.add_edge("extract_audio", "delete_file")
142
162
  workflow.add_edge("delete_file", END)
163
+ workflow.add_edge("download_remote_file", "file_type")
143
164
 
144
165
  # Compile graph
145
166
  graph = workflow.compile()
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup, Comment
6
6
 
7
7
  from content_core.common import ProcessSourceState
8
8
  from content_core.logging import logger
9
+ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
9
10
 
10
11
  # future: better extraction methods
11
12
  # https://github.com/buriy/python-readability
@@ -20,12 +21,20 @@ async def url_provider(state: ProcessSourceState):
20
21
  url = state.url
21
22
  if url:
22
23
  if "youtube.com" in url or "youtu.be" in url:
23
- return_dict["identified_type"] = (
24
- "youtube" # future: playlists, channels in the future
25
- )
24
+ return_dict["identified_type"] = "youtube"
26
25
  else:
27
- return_dict["identified_type"] = "article"
28
- # future: article providers in the future
26
+ # remote URL: check content-type to catch PDFs
27
+ try:
28
+ async with aiohttp.ClientSession() as session:
29
+ async with session.head(url, timeout=10, allow_redirects=True) as resp:
30
+ mime = resp.headers.get("content-type", "").split(";", 1)[0]
31
+ except Exception as e:
32
+ logger.debug(f"HEAD check failed for {url}: {e}")
33
+ mime = "article"
34
+ if mime in SUPPORTED_FITZ_TYPES:
35
+ return_dict["identified_type"] = mime
36
+ else:
37
+ return_dict["identified_type"] = "article"
29
38
  return return_dict
30
39
 
31
40
 
@@ -200,3 +200,13 @@ async def test_extract_content_from_xlsx(fixture_path):
200
200
  )
201
201
  assert result.title is not None # Attempt to extract title/metadata
202
202
  assert len(result.content) > 0 # Check that some content was extracted
203
+
204
+
205
+ @pytest.mark.asyncio
206
+ async def test_extract_content_from_pdf_url():
207
+ """Tests extracting content from a remote PDF URL."""
208
+ url = "https://arxiv.org/pdf/2408.09869"
209
+ result = await extract_content({"url": url})
210
+ assert result.source_type == "url"
211
+ assert result.identified_type == "application/pdf"
212
+ assert len(result.content) > 100 # Expect substantial extracted text
@@ -354,7 +354,7 @@ wheels = [
354
354
 
355
355
  [[package]]
356
356
  name = "content-core"
357
- version = "0.3.1"
357
+ version = "0.4.0"
358
358
  source = { editable = "." }
359
359
  dependencies = [
360
360
  { name = "aiohttp" },
@@ -1018,7 +1018,7 @@ wheels = [
1018
1018
  { url = "https://files.pythonhosted.org/packages/8c/de/8eb6fffecd9c5f129461edcdd7e1ac944f9de15783e3d89c84ed6e0374bc/lxml-5.3.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:aa837e6ee9534de8d63bc4c1249e83882a7ac22bd24523f83fad68e6ffdf41ae", size = 5652903 },
1019
1019
  { url = "https://files.pythonhosted.org/packages/95/79/80f4102a08495c100014593680f3f0f7bd7c1333b13520aed855fc993326/lxml-5.3.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:da4c9223319400b97a2acdfb10926b807e51b69eb7eb80aad4942c0516934858", size = 5491813 },
1020
1020
  { url = "https://files.pythonhosted.org/packages/15/f5/9b1f7edf6565ee31e4300edb1bcc61eaebe50a3cff4053c0206d8dc772f2/lxml-5.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:dc0e9bdb3aa4d1de703a437576007d366b54f52c9897cae1a3716bb44fc1fc85", size = 5227837 },
1021
- { url = "https://files.pythonhosted.org/packages/5c/17/c31d94364c02e3492215658917f5590c00edce8074aeb06d05b7771465d9/lxml-5.3.2-cp310-cp310-win32.whl", hash = "sha256:5f94909a1022c8ea12711db7e08752ca7cf83e5b57a87b59e8a583c5f35016ad", size = 3477533 },
1021
+ { url = "https://files.pythonhosted.org/packages/dd/53/a187c4ccfcd5fbfca01e6c96da39499d8b801ab5dcf57717db95d7a968a8/lxml-5.3.2-cp310-cp310-win32.win32.whl", hash = "sha256:dd755a0a78dd0b2c43f972e7b51a43be518ebc130c9f1a7c4480cf08b4385486", size = 3477533 },
1022
1022
  { url = "https://files.pythonhosted.org/packages/f2/2c/397c5a9d76a7a0faf9e5b13143ae1a7e223e71d2197a45da71c21aacb3d4/lxml-5.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:d64ea1686474074b38da13ae218d9fde0d1dc6525266976808f41ac98d9d7980", size = 3805160 },
1023
1023
  { url = "https://files.pythonhosted.org/packages/84/b8/2b727f5a90902f7cc5548349f563b60911ca05f3b92e35dfa751349f265f/lxml-5.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9d61a7d0d208ace43986a92b111e035881c4ed45b1f5b7a270070acae8b0bfb4", size = 8163457 },
1024
1024
  { url = "https://files.pythonhosted.org/packages/91/84/23135b2dc72b3440d68c8f39ace2bb00fe78e3a2255f7c74f7e76f22498e/lxml-5.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:856dfd7eda0b75c29ac80a31a6411ca12209183e866c33faf46e77ace3ce8a79", size = 4433445 },
File without changes
File without changes
File without changes
File without changes
File without changes