not-again-ai 0.19.0__tar.gz → 0.20.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.github/copilot-instructions.md +2 -1
  2. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/PKG-INFO +9 -4
  3. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/README.md +5 -1
  4. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/pyproject.toml +5 -4
  5. not_again_ai-0.20.0/src/not_again_ai/data/brave_search_api.py +203 -0
  6. not_again_ai-0.20.0/src/not_again_ai/data/web.py +160 -0
  7. not_again_ai-0.20.0/tests/data/test_brave_search_api.py +34 -0
  8. not_again_ai-0.20.0/tests/data/test_web.py +20 -0
  9. not_again_ai-0.20.0/tests/viz/__init__.py +0 -0
  10. not_again_ai-0.20.0/uv.lock +3593 -0
  11. not_again_ai-0.19.0/src/not_again_ai/data/__init__.py +0 -7
  12. not_again_ai-0.19.0/src/not_again_ai/data/web.py +0 -56
  13. not_again_ai-0.19.0/tests/data/test_web.py +0 -28
  14. not_again_ai-0.19.0/uv.lock +0 -2361
  15. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.editorconfig +0 -0
  16. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.gitattributes +0 -0
  17. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.github/_typos.toml +0 -0
  18. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.github/workflows/codeql-analysis.yml +0 -0
  19. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.github/workflows/python.yml +0 -0
  20. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.gitignore +0 -0
  21. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.vscode/launch.json +0 -0
  22. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/.vscode/settings.json +0 -0
  23. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/LICENSE +0 -0
  24. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/assets/barplot_test4.png +0 -0
  25. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/assets/distributions_test4.svg +0 -0
  26. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/assets/scatterplot_basic1.png +0 -0
  27. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/assets/ts_lineplot5.svg +0 -0
  28. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/notebooks/base/base.ipynb +0 -0
  29. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/notebooks/llm/01_openai_chat_completion.ipynb +0 -0
  30. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/notebooks/llm/02_ollama_intro.ipynb +0 -0
  31. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/notebooks/llm/03_llm_streaming.ipynb +0 -0
  32. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/notebooks/llm/10_gpt-4-v.ipynb +0 -0
  33. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/notebooks/llm/20_embeddings.ipynb +0 -0
  34. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/notebooks/statistics/statistics.ipynb +0 -0
  35. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/notebooks/viz/viz.ipynb +0 -0
  36. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/noxfile.py +0 -0
  37. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/__init__.py +0 -0
  38. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/base/__init__.py +0 -0
  39. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/base/file_system.py +0 -0
  40. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/base/parallel.py +0 -0
  41. {not_again_ai-0.19.0/src/not_again_ai/llm → not_again_ai-0.20.0/src/not_again_ai/data}/__init__.py +0 -0
  42. {not_again_ai-0.19.0/src/not_again_ai/llm/chat_completion/providers → not_again_ai-0.20.0/src/not_again_ai/llm}/__init__.py +0 -0
  43. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/chat_completion/__init__.py +0 -0
  44. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/chat_completion/interface.py +0 -0
  45. {not_again_ai-0.19.0/src/not_again_ai/llm/embedding → not_again_ai-0.20.0/src/not_again_ai/llm/chat_completion}/providers/__init__.py +0 -0
  46. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/chat_completion/providers/anthropic_api.py +0 -0
  47. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/chat_completion/providers/gemini_api.py +0 -0
  48. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/chat_completion/providers/ollama_api.py +0 -0
  49. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/chat_completion/providers/openai_api.py +0 -0
  50. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/chat_completion/types.py +0 -0
  51. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/embedding/__init__.py +0 -0
  52. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/embedding/interface.py +0 -0
  53. {not_again_ai-0.19.0/src/not_again_ai/llm/image_gen → not_again_ai-0.20.0/src/not_again_ai/llm/embedding}/providers/__init__.py +0 -0
  54. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/embedding/providers/ollama_api.py +0 -0
  55. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/embedding/providers/openai_api.py +0 -0
  56. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/embedding/types.py +0 -0
  57. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/image_gen/__init__.py +0 -0
  58. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/image_gen/interface.py +0 -0
  59. {not_again_ai-0.19.0/src/not_again_ai/llm/prompting → not_again_ai-0.20.0/src/not_again_ai/llm/image_gen}/providers/__init__.py +0 -0
  60. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/image_gen/providers/openai_api.py +0 -0
  61. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/image_gen/types.py +0 -0
  62. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/prompting/__init__.py +0 -0
  63. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/prompting/compile_prompt.py +0 -0
  64. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/prompting/interface.py +0 -0
  65. {not_again_ai-0.19.0/tests → not_again_ai-0.20.0/src/not_again_ai/llm/prompting/providers}/__init__.py +0 -0
  66. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/prompting/providers/openai_tiktoken.py +0 -0
  67. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/llm/prompting/types.py +0 -0
  68. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/py.typed +0 -0
  69. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/statistics/__init__.py +0 -0
  70. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/statistics/dependence.py +0 -0
  71. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/viz/__init__.py +0 -0
  72. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/viz/barplots.py +0 -0
  73. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/viz/distributions.py +0 -0
  74. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/viz/scatterplot.py +0 -0
  75. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/viz/time_series.py +0 -0
  76. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/src/not_again_ai/viz/utils.py +0 -0
  77. {not_again_ai-0.19.0/tests/base → not_again_ai-0.20.0/tests}/__init__.py +0 -0
  78. {not_again_ai-0.19.0/tests/data → not_again_ai-0.20.0/tests/base}/__init__.py +0 -0
  79. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/base/test_file_system.py +0 -0
  80. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/base/test_parallel.py +0 -0
  81. {not_again_ai-0.19.0/tests/llm → not_again_ai-0.20.0/tests/data}/__init__.py +0 -0
  82. {not_again_ai-0.19.0/tests/llm/chat_completion → not_again_ai-0.20.0/tests/llm}/__init__.py +0 -0
  83. {not_again_ai-0.19.0/tests/llm/embedding → not_again_ai-0.20.0/tests/llm/chat_completion}/__init__.py +0 -0
  84. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/chat_completion/test_chat_completion.py +0 -0
  85. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/chat_completion/test_chat_completion_stream.py +0 -0
  86. {not_again_ai-0.19.0/tests/llm/prompting → not_again_ai-0.20.0/tests/llm/embedding}/__init__.py +0 -0
  87. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/embedding/test_embedding.py +0 -0
  88. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/image_gen/test_image_gen.py +0 -0
  89. {not_again_ai-0.19.0/tests/statistics → not_again_ai-0.20.0/tests/llm/prompting}/__init__.py +0 -0
  90. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/prompting/test_compile_messages.py +0 -0
  91. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/prompting/test_tokenizer.py +0 -0
  92. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/SKDiagram.png +0 -0
  93. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/SKInfographic.png +0 -0
  94. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/body_lotion.png +0 -0
  95. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/cat.jpg +0 -0
  96. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/dog.jpg +0 -0
  97. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/numbers.png +0 -0
  98. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/soap.png +0 -0
  99. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/sunlit_lounge.png +0 -0
  100. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/llm/sample_images/sunlit_lounge_mask.png +0 -0
  101. {not_again_ai-0.19.0/tests/viz → not_again_ai-0.20.0/tests/statistics}/__init__.py +0 -0
  102. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/statistics/test_dependence.py +0 -0
  103. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/viz/test_barplot.py +0 -0
  104. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/viz/test_distributions.py +0 -0
  105. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/viz/test_scatterplot.py +0 -0
  106. {not_again_ai-0.19.0 → not_again_ai-0.20.0}/tests/viz/test_time_series.py +0 -0

.github/copilot-instructions.md
@@ -7,4 +7,5 @@
 - If the user is using Pydantic, it is version >=2.10
 - Always prefer pathlib for dealing with files. Use `Path.open` instead of `open`.
 - Prefer to use pendulum instead of datetime
-- Prefer to use loguru instead of logging
+- Prefer to use loguru instead of logging
+- Prefer httpx for HTTP requests instead of requests
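
For illustration only (not part of the diff): a minimal sketch of the conventions this file now prescribes, combining httpx, pathlib, and loguru. The URL and file name are hypothetical.

from pathlib import Path

import httpx
from loguru import logger

# Fetch a page with httpx (preferred over requests) and write it via Path.open.
response = httpx.get("https://example.com")  # hypothetical URL
response.raise_for_status()
with Path("page.html").open("w", encoding="utf-8") as f:  # hypothetical file name
    f.write(response.text)
logger.info("Saved {} characters", len(response.text))  # loguru instead of logging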

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: not-again-ai
-Version: 0.19.0
+Version: 0.20.0
 Summary: Designed to once and for all collect all the little things that come up over and over again in AI projects and put them in one place.
 Project-URL: Homepage, https://github.com/DaveCoDev/not-again-ai
 Project-URL: Documentation, https://davecodev.github.io/not-again-ai/
@@ -22,8 +22,9 @@ Requires-Python: >=3.11
 Requires-Dist: loguru<1.0,>=0.7
 Requires-Dist: pydantic<3.0,>=2.11
 Provides-Extra: data
-Requires-Dist: playwright<2.0,>=1.51; extra == 'data'
-Requires-Dist: pytest-playwright<1.0,>=0.7; extra == 'data'
+Requires-Dist: crawl4ai<1.0,>=0.6; extra == 'data'
+Requires-Dist: httpx<1.0,>=0.28; extra == 'data'
+Requires-Dist: markitdown[pdf]==0.1.2; extra == 'data'
 Provides-Extra: llm
 Requires-Dist: anthropic<1.0,>=0.50; extra == 'llm'
 Requires-Dist: azure-identity<2.0,>=1.21; extra == 'llm'
@@ -83,7 +84,9 @@ The package is split into subpackages, so you can install only the parts you need
 
 ### Data
 1. `pip install not_again_ai[data]`
-1. `playwright install` to download the browser binaries.
+1. `crawl4ai-setup` to run crawl4ai post-installation setup.
+1. Set the `BRAVE_SEARCH_API_KEY` environment variable to use the Brave Search API for web data extraction.
+1. Get the API key from https://api-dashboard.search.brave.com/app/keys. You must have at least the Free "Data for Search" subscription.
 
 
 ### LLM
@@ -312,3 +315,5 @@ Default settings are configured in [`.vscode/settings.json`](./.vscode/settings.
 
 # Attributions
 [python-blueprint](https://github.com/johnthagen/python-blueprint) for the Python package skeleton.
+
+This project uses Crawl4AI (https://github.com/unclecode/crawl4ai) for web data extraction.

README.md
@@ -39,7 +39,9 @@ The package is split into subpackages, so you can install only the parts you need
 
 ### Data
 1. `pip install not_again_ai[data]`
-1. `playwright install` to download the browser binaries.
+1. `crawl4ai-setup` to run crawl4ai post-installation setup.
+1. Set the `BRAVE_SEARCH_API_KEY` environment variable to use the Brave Search API for web data extraction.
+1. Get the API key from https://api-dashboard.search.brave.com/app/keys. You must have at least the Free "Data for Search" subscription.
 
 
 ### LLM
@@ -268,3 +270,5 @@ Default settings are configured in [`.vscode/settings.json`](./.vscode/settings.
 
 # Attributions
 [python-blueprint](https://github.com/johnthagen/python-blueprint) for the Python package skeleton.
+
+This project uses Crawl4AI (https://github.com/unclecode/crawl4ai) for web data extraction.

pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "not-again-ai"
-version = "0.19.0"
+version = "0.20.0"
 description = "Designed to once and for all collect all the little things that come up over and over again in AI projects and put them in one place."
 authors = [
     { name = "DaveCoDev", email = "dave.co.dev@gmail.com" }
@@ -34,8 +34,9 @@ Repository = "https://github.com/DaveCoDev/not-again-ai"
 
 [project.optional-dependencies]
 data = [
-    "playwright>=1.51,<2.0",
-    "pytest-playwright>=0.7,<1.0",
+    "Crawl4AI>=0.6,<1.0",
+    "httpx>=0.28,<1.0",
+    "markitdown[pdf]==0.1.2"
 ]
 llm = [
     "anthropic>=0.50,<1.0",
@@ -140,7 +141,7 @@ filterwarnings = [
     "error",
     # Add additional warning suppressions as needed here. For example, if a third-party library
     # is throwing a deprecation warning that needs to be fixed upstream:
-    # "ignore::DeprecationWarning:typer",
+    "ignore::DeprecationWarning",
     "ignore::pytest.PytestUnraisableExceptionWarning"
 ]
 asyncio_mode = "auto"

src/not_again_ai/data/brave_search_api.py (new file)
@@ -0,0 +1,203 @@
+import os
+
+import httpx
+from loguru import logger
+from pydantic import BaseModel
+
+
+class SearchWebResult(BaseModel):
+    title: str
+    url: str
+    description: str
+    netloc: str | None = None
+
+
+class SearchWebResults(BaseModel):
+    results: list[SearchWebResult]
+
+
+async def search(
+    query: str,
+    count: int = 20,
+    offset: int = 0,
+    country: str = "US",
+    search_lang: str = "en",
+    ui_lang: str = "en-US",
+    freshness: str | None = None,
+    timezone: str = "America/New_York",
+    state: str = "MA",
+    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.",
+) -> SearchWebResults:
+    """
+    Search using Brave Search API.
+
+    Args:
+        query: The search query string
+        count: Number of search results to return (1-20, default 20)
+        offset: Number of search results to skip (default 0)
+        country: Country code for search results (default "US")
+        search_lang: Language for search (default "en")
+        ui_lang: User interface language (default "en-US")
+        freshness: Freshness of results ("pd", "pw", "pm", "py" or YYYY-MM-DDtoYYYY-MM-DD or None)
+        timezone: Timezone for search results (default "America/New_York")
+        state: State for search results (default "MA")
+        user_agent: User agent string for the request (default is a common browser UA)
+
+    Returns:
+        SearchWebResults: A model containing the search results
+
+    Raises:
+        httpx.HTTPError: If the request fails
+        ValueError: If BRAVE_SEARCH_API_KEY is not set
+    """
+    api_key = os.getenv("BRAVE_SEARCH_API_KEY")
+    if not api_key:
+        raise ValueError("BRAVE_SEARCH_API_KEY environment variable is not set")
+
+    url = "https://api.search.brave.com/res/v1/web/search"
+
+    headers = {
+        "Accept": "application/json",
+        "Accept-Encoding": "gzip",
+        "X-Subscription-Token": api_key,
+        "X-Loc-Country": country,
+        "X-Loc-Timezone": timezone,
+        "X-Loc-State": state,
+        "User-Agent": user_agent,
+    }
+
+    params: dict[str, str | int | bool] = {
+        "q": query,
+        "count": count,
+        "offset": offset,
+        "country": country,
+        "search_lang": search_lang,
+        "ui_lang": ui_lang,
+        "text_decorations": False,
+        "spellcheck": False,
+        "units": "imperial",
+        "extra_snippets": False,
+        "safesearch": "off",
+    }
+
+    # Add optional parameters if provided
+    if freshness:
+        params["freshness"] = freshness
+
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.get(url, headers=headers, params=params)
+            response.raise_for_status()
+            data = response.json()
+            results_list: list[SearchWebResult] = []
+            for item in data.get("web", {}).get("results", []):
+                result = SearchWebResult(
+                    title=item.get("title", ""),
+                    url=item.get("url", ""),
+                    description=item.get("snippet", ""),
+                    netloc=item.get("meta_url", {}).get("netloc", None),
+                )
+                results_list.append(result)
+            return SearchWebResults(results=results_list)
+
+    except httpx.HTTPError as e:
+        logger.error(f"HTTP error during Brave search: {e}")
+        raise
+    except Exception as e:
+        logger.error(f"Unexpected error during Brave search: {e}")
+        raise
+
+
+class SearchNewsResult(BaseModel):
+    title: str
+    url: str
+    description: str
+    age: str
+    netloc: str | None = None
+
+
+class SearchNewsResults(BaseModel):
+    results: list[SearchNewsResult]
+
+
+async def search_news(
+    query: str,
+    count: int = 20,
+    offset: int = 0,
+    country: str = "US",
+    search_lang: str = "en",
+    ui_lang: str = "en-US",
+    freshness: str | None = None,
+    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.",
+) -> SearchNewsResults:
+    """
+    Search news using Brave News Search API.
+
+    Args:
+        query: The search query string
+        count: Number of news results to return (1-20, default 20)
+        offset: Number of search results to skip (default 0)
+        country: Country code for search results (default "US")
+        search_lang: Language for search (default "en")
+        ui_lang: User interface language (default "en-US")
+        freshness: Freshness of results ("pd", "pw", "pm", "py" or YYYY-MM-DDtoYYYY-MM-DD or None)
+        user_agent: User agent string for the request (default is a common browser UA)
+
+    Returns:
+        SearchNewsResults: A model containing the news search results
+
+    Raises:
+        httpx.HTTPError: If the request fails
+        ValueError: If BRAVE_SEARCH_API_KEY is not set
+    """
+    api_key = os.getenv("BRAVE_SEARCH_API_KEY")
+    if not api_key:
+        raise ValueError("BRAVE_SEARCH_API_KEY environment variable is not set")
+
+    url = "https://api.search.brave.com/res/v1/news/search"
+
+    headers = {
+        "Accept": "application/json",
+        "Accept-Encoding": "gzip",
+        "X-Subscription-Token": api_key,
+        "User-Agent": user_agent,
+    }
+
+    params: dict[str, str | int | bool] = {
+        "q": query,
+        "count": count,
+        "offset": offset,
+        "country": country,
+        "search_lang": search_lang,
+        "ui_lang": ui_lang,
+        "spellcheck": False,
+        "safesearch": "off",
+    }
+
+    # Add optional parameters if provided
+    if freshness:
+        params["freshness"] = freshness
+
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.get(url, headers=headers, params=params)
+            response.raise_for_status()
+            data = response.json()
+            results_list: list[SearchNewsResult] = []
+            for item in data.get("results", []):
+                result = SearchNewsResult(
+                    title=item.get("title", ""),
+                    url=item.get("url", ""),
+                    description=item.get("description", ""),
+                    age=item.get("age"),
+                    netloc=item.get("meta_url", {}).get("netloc", None),
+                )
+                results_list.append(result)
+            return SearchNewsResults(results=results_list)
+
+    except httpx.HTTPError as e:
+        logger.error(f"HTTP error during Brave news search: {e}")
+        raise
+    except Exception as e:
+        logger.error(f"Unexpected error during Brave news search: {e}")
+        raise
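
For illustration only (not part of the diff): a minimal usage sketch of the new module, assuming the `data` extra is installed and `BRAVE_SEARCH_API_KEY` is set. The query string is arbitrary.

import asyncio

from not_again_ai.data.brave_search_api import search


async def main() -> None:
    # search() raises ValueError if BRAVE_SEARCH_API_KEY is unset.
    results = await search(query="brave search api", count=5)
    for result in results.results:
        print(result.title, result.url)


asyncio.run(main())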

src/not_again_ai/data/web.py (new file)
@@ -0,0 +1,160 @@
+import asyncio
+import io
+import mimetypes
+from pathlib import Path
+import re
+from urllib.parse import urlparse
+
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+import httpx
+from markitdown import MarkItDown, StreamInfo
+from pydantic import BaseModel
+
+
+class Link(BaseModel):
+    url: str
+    text: str
+
+
+class URLResult(BaseModel):
+    url: str
+    markdown: str
+    links: list[Link] = []
+
+
+async def _markitdown_bytes_to_str(file_bytes: bytes, filename_extension: str) -> str:
+    """
+    Convert a file using MarkItDown defaults.
+    """
+    with io.BytesIO(file_bytes) as temp:
+        result = await asyncio.to_thread(
+            MarkItDown(enable_plugins=False).convert,
+            source=temp,
+            stream_info=StreamInfo(extension=filename_extension),
+        )
+        text = result.text_content
+    return text
+
+
+def _detect_pdf_extension(url: str) -> bool:
+    """
+    Detect if the URL is a PDF based on its extension.
+    """
+    parsed_url = urlparse(url)
+    filename = Path(parsed_url.path).name
+    return mimetypes.guess_type(filename)[0] == "application/pdf"
+
+
+def _detect_google_sheets(url: str) -> bool:
+    """
+    Detect if the URL is a Google Sheets document.
+    """
+    is_google_sheets = url.startswith("https://docs.google.com/spreadsheets/")
+    return is_google_sheets
+
+
+async def _handle_pdf_content(url: str) -> URLResult:
+    md = MarkItDown(enable_plugins=False)
+    result = md.convert(url)
+    url_result = URLResult(
+        url=url,
+        markdown=result.markdown or "",
+        links=[],
+    )
+    return url_result
+
+
+async def _handle_google_sheets_content(url: str) -> URLResult:
+    """
+    Handle Google Sheets by using the export URL to get the raw content.
+    """
+    edit_pattern = r"https://docs\.google\.com/spreadsheets/d/([a-zA-Z0-9-_]+)/edit"
+    export_pattern = r"https://docs\.google\.com/spreadsheets/d/([a-zA-Z0-9-_]+)/export\?format=csv"
+
+    # Check if it's already an export URL
+    export_match = re.search(export_pattern, url)
+    if export_match:
+        export_url = url
+    else:
+        # Check if it's an edit URL and extract document ID
+        edit_match = re.search(edit_pattern, url)
+        if edit_match:
+            doc_id = edit_match.group(1)
+            export_url = f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&gid=0"
+        else:
+            return await _handle_web_content(url)
+
+    async with httpx.AsyncClient(follow_redirects=True) as client:
+        response = await client.get(export_url)
+        response.raise_for_status()
+        csv_bytes = response.content
+
+    # Convert CSV to markdown using MarkItDown
+    markdown_content = await _markitdown_bytes_to_str(csv_bytes, ".csv")
+
+    url_result = URLResult(
+        url=url,
+        markdown=markdown_content,
+        links=[],
+    )
+    return url_result
+
+
+async def _handle_web_content(url: str) -> URLResult:
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True,
+        verbose=False,
+        user_agent_mode="random",
+        java_script_enabled=True,
+    )
+    run_config = CrawlerRunConfig(
+        scan_full_page=True,
+        user_agent_mode="random",
+        cache_mode=CacheMode.DISABLED,
+        markdown_generator=DefaultMarkdownGenerator(),
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url=url,
+            config=run_config,
+        )
+
+        if result.response_headers.get("content-type") == "application/pdf":
+            return await _handle_pdf_content(url)
+
+        links: list[Link] = []
+        seen_urls: set[str] = set()
+        combined_link_data = result.links.get("internal", []) + result.links.get("external", [])
+        for link_data in combined_link_data:
+            href = link_data.get("href", "")
+            if href and href not in seen_urls:
+                seen_urls.add(href)
+                link = Link(
+                    url=href,
+                    text=link_data.get("title", "") or link_data.get("text", ""),
+                )
+                links.append(link)
+
+        url_result = URLResult(
+            url=url,
+            markdown=result.markdown or "",
+            links=links,
+        )
+        return url_result
+
+
+async def process_url(url: str) -> URLResult:
+    """
+    Process a URL to extract content and convert it to Markdown and links
+    """
+    if _detect_pdf_extension(url):
+        url_result = await _handle_pdf_content(url)
+    elif _detect_google_sheets(url):
+        url_result = await _handle_google_sheets_content(url)
+    else:
+        url_result = await _handle_web_content(url)
+    return url_result
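
For illustration only (not part of the diff): a minimal usage sketch of the new `process_url` entry point, assuming the `data` extra is installed and `crawl4ai-setup` has been run. The URL is arbitrary.

import asyncio

from not_again_ai.data.web import process_url


async def main() -> None:
    # Dispatches to the PDF, Google Sheets, or crawl4ai handler based on the URL.
    result = await process_url("https://example.com")
    print(result.markdown[:200])
    print(f"Found {len(result.links)} links")


asyncio.run(main())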

tests/data/test_brave_search_api.py (new file)
@@ -0,0 +1,34 @@
+from typing import Any
+
+import pytest
+
+from not_again_ai.data.brave_search_api import search, search_news
+
+
+@pytest.mark.parametrize(
+    ("query", "search_params"),
+    [
+        ("brave search", {}),
+        ("python programming", {"count": 2, "country": "US"}),
+        ("machine learning", {"count": 4, "search_lang": "en", "freshness": "pw"}),
+        ("AI news", {"count": 1, "offset": 5, "country": "GB", "ui_lang": "en-GB"}),
+    ],
+)
+async def test_brave_search_api(query: str, search_params: dict[str, Any]) -> None:
+    """Test the Brave Search API with a sample query and optional parameters."""
+    content = await search(query=query, **search_params)
+    assert content.results, f"No results returned for query: {query}"
+
+
+@pytest.mark.skip("API Cost")
+@pytest.mark.parametrize(
+    ("query", "search_params"),
+    [
+        ("latest tech news", {}),
+        ("AI breakthrough", {"count": 3, "country": "US"}),
+    ],
+)
+async def test_brave_search_news_api(query: str, search_params: dict[str, Any]) -> None:
+    """Test the Brave News Search API with a sample query and optional parameters."""
+    content = await search_news(query=query, **search_params)
+    assert content.results, f"No news results returned for query: {query}"

tests/data/test_web.py (new file)
@@ -0,0 +1,20 @@
+import pytest
+
+from not_again_ai.data.web import process_url
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "https://example.com",
+        "https://github.com/unclecode/crawl4ai",
+        "https://arxiv.org/pdf/1710.02298",
+        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+        "https://www.nascar.com/news/nascar-craftsman-truck-series/",
+        "https://docs.google.com/spreadsheets/d/1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms/edit?gid=0#gid=0",
+        "https://docs.google.com/spreadsheets/d/1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms/export?format=csv&gid=0",
+    ],
+)
+async def test_process_url(url: str) -> None:
+    content = await process_url(url)
+    assert content, f"Content should not be empty for URL: {url}"