noesium-0.1.0-py3-none-any.whl → noesium-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. noesium/agents/askura_agent/__init__.py +22 -0
  2. noesium/agents/askura_agent/askura_agent.py +480 -0
  3. noesium/agents/askura_agent/conversation.py +164 -0
  4. noesium/agents/askura_agent/extractor.py +175 -0
  5. noesium/agents/askura_agent/memory.py +14 -0
  6. noesium/agents/askura_agent/models.py +239 -0
  7. noesium/agents/askura_agent/prompts.py +202 -0
  8. noesium/agents/askura_agent/reflection.py +234 -0
  9. noesium/agents/askura_agent/summarizer.py +30 -0
  10. noesium/agents/askura_agent/utils.py +6 -0
  11. noesium/agents/deep_research/__init__.py +13 -0
  12. noesium/agents/deep_research/agent.py +398 -0
  13. noesium/agents/deep_research/prompts.py +84 -0
  14. noesium/agents/deep_research/schemas.py +42 -0
  15. noesium/agents/deep_research/state.py +54 -0
  16. noesium/agents/search/__init__.py +5 -0
  17. noesium/agents/search/agent.py +474 -0
  18. noesium/agents/search/state.py +28 -0
  19. noesium/core/__init__.py +1 -1
  20. noesium/core/agent/base.py +10 -2
  21. noesium/core/goalith/decomposer/llm_decomposer.py +1 -1
  22. noesium/core/llm/__init__.py +1 -1
  23. noesium/core/llm/base.py +2 -2
  24. noesium/core/llm/litellm.py +42 -21
  25. noesium/core/llm/llamacpp.py +25 -4
  26. noesium/core/llm/ollama.py +43 -22
  27. noesium/core/llm/openai.py +25 -5
  28. noesium/core/llm/openrouter.py +1 -1
  29. noesium/core/toolify/base.py +9 -2
  30. noesium/core/toolify/config.py +2 -2
  31. noesium/core/toolify/registry.py +21 -5
  32. noesium/core/tracing/opik_tracing.py +7 -7
  33. noesium/core/vector_store/__init__.py +2 -2
  34. noesium/core/vector_store/base.py +1 -1
  35. noesium/core/vector_store/pgvector.py +10 -13
  36. noesium/core/vector_store/weaviate.py +2 -1
  37. noesium/toolkits/__init__.py +1 -0
  38. noesium/toolkits/arxiv_toolkit.py +310 -0
  39. noesium/toolkits/audio_aliyun_toolkit.py +441 -0
  40. noesium/toolkits/audio_toolkit.py +370 -0
  41. noesium/toolkits/bash_toolkit.py +332 -0
  42. noesium/toolkits/document_toolkit.py +454 -0
  43. noesium/toolkits/file_edit_toolkit.py +552 -0
  44. noesium/toolkits/github_toolkit.py +395 -0
  45. noesium/toolkits/gmail_toolkit.py +575 -0
  46. noesium/toolkits/image_toolkit.py +425 -0
  47. noesium/toolkits/memory_toolkit.py +398 -0
  48. noesium/toolkits/python_executor_toolkit.py +334 -0
  49. noesium/toolkits/search_toolkit.py +451 -0
  50. noesium/toolkits/serper_toolkit.py +623 -0
  51. noesium/toolkits/tabular_data_toolkit.py +537 -0
  52. noesium/toolkits/user_interaction_toolkit.py +365 -0
  53. noesium/toolkits/video_toolkit.py +168 -0
  54. noesium/toolkits/wikipedia_toolkit.py +420 -0
  55. noesium-0.2.1.dist-info/METADATA +253 -0
  56. {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/RECORD +59 -23
  57. {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/licenses/LICENSE +1 -1
  58. noesium-0.1.0.dist-info/METADATA +0 -525
  59. {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/WHEEL +0 -0
  60. {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/top_level.txt +0 -0
noesium/toolkits/tabular_data_toolkit.py (new file)
@@ -0,0 +1,537 @@
+"""
+Tabular data toolkit for analyzing and processing structured data files.
+
+Provides tools for reading, analyzing, and extracting insights from various
+tabular data formats including CSV, Excel, JSON, and Parquet files.
+"""
+
+import json
+import math
+import os
+from pathlib import Path
+from typing import Callable, Dict, List, Optional
+
+from noesium.core.toolify.base import AsyncBaseToolkit
+from noesium.core.toolify.config import ToolkitConfig
+from noesium.core.toolify.registry import register_toolkit
+from noesium.core.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+try:
+    import pandas as pd
+
+    PANDAS_AVAILABLE = True
+except ImportError:
+    pd = None
+    PANDAS_AVAILABLE = False
+
+# Template for column analysis
+COLUMN_ANALYSIS_TEMPLATE = """You are a data analysis agent that extracts and summarizes data structure information from tabular data files (CSV, Excel, etc.).
+
+<column_info>
+{column_info}
+</column_info>
+
+You should extract the file structure (e.g. the delimiter), and provide detailed column information (e.g. column_name, type, column explanation and sample values) for each column.
+
+<output_format>
+### File Structure
+- Delimiter: <the delimiter used in the file, e.g. ',', '\\t', ' '>
+
+### Columns
+| Column Name | Type | Explanation | Sample Value |
+|-------------|------|-------------|--------------|
+| name_of_column1 | type_of_column1 | explanation_of_column1, i.e. what the column represents | sample_value_of_column1 |
+| name_of_column2 | type_of_column2 | explanation_of_column2, i.e. what the column represents | sample_value_of_column2 |
+| ... | ... | ... | ... |
+</output_format>"""
+
+
+@register_toolkit("tabular_data")
+class TabularDataToolkit(AsyncBaseToolkit):
+    """
+    Toolkit for tabular data analysis and processing.
+
+    This toolkit provides capabilities for:
+    - Reading various tabular data formats (CSV, Excel, JSON, Parquet, TSV)
+    - Analyzing data structure and column information
+    - Extracting metadata and statistics
+    - LLM-powered data interpretation
+    - Data quality assessment
+
+    Features:
+    - Multi-format support with automatic encoding detection
+    - Intelligent column analysis and interpretation
+    - Statistical summaries and data profiling
+    - LLM-powered column explanation generation
+    - Data validation and quality checks
+    - Memory-efficient processing for large files
+
+    Supported Formats:
+    - **CSV**: Comma-separated values with encoding detection
+    - **Excel**: XLSX and XLS files with multiple sheets
+    - **JSON**: JSON files with tabular structure
+    - **Parquet**: Columnar storage format
+    - **TSV**: Tab-separated values
+    - **Generic**: Auto-detection for other delimited formats
+
+    Required dependency: pandas
+    Install with: pip install pandas
+    """
+
+    def __init__(self, config: ToolkitConfig = None):
+        """
+        Initialize the tabular data toolkit.
+
+        Args:
+            config: Toolkit configuration
+
+        Raises:
+            ImportError: If pandas is not installed
+        """
+        super().__init__(config)
+
+        if not PANDAS_AVAILABLE:
+            raise ImportError("pandas is required for TabularDataToolkit. " "Install with: pip install pandas")
+
+        # Configuration
+        self.max_file_size = self.config.config.get("max_file_size", 100 * 1024 * 1024)  # 100MB
+        self.max_rows_preview = self.config.config.get("max_rows_preview", 1000)
+        self.cache_dir = Path(self.config.config.get("cache_dir", "./tabular_cache"))
+
+        # Create cache directory
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+        self.logger.info("Tabular data toolkit initialized")
+
+    def _load_tabular_data(self, file_path: str, **kwargs) -> pd.DataFrame:
+        """
+        Load tabular data from a file with automatic format detection.
+
+        Args:
+            file_path: Path to the data file
+            **kwargs: Additional parameters for pandas readers
+
+        Returns:
+            DataFrame containing the tabular data
+
+        Raises:
+            Exception: If the file cannot be loaded as tabular data
+        """
+        file_path = Path(file_path)
+
+        # Check file size
+        if file_path.stat().st_size > self.max_file_size:
+            raise ValueError(f"File too large ({file_path.stat().st_size} bytes, max: {self.max_file_size})")
+
+        file_ext = file_path.suffix.lower()
+
+        try:
+            if file_ext == ".csv":
+                # Try different encodings for CSV files
+                encodings = ["utf-8", "latin1", "cp1252", "iso-8859-1"]
+                for encoding in encodings:
+                    try:
+                        df = pd.read_csv(file_path, encoding=encoding, **kwargs)
+                        self.logger.info(f"Successfully loaded CSV with {encoding} encoding")
+                        return df
+                    except UnicodeDecodeError:
+                        continue
+                raise Exception("Could not read CSV file with any supported encoding")
+
+            elif file_ext in [".xlsx", ".xls"]:
+                df = pd.read_excel(file_path, **kwargs)
+                self.logger.info(f"Successfully loaded Excel file")
+                return df
+
+            elif file_ext == ".json":
+                df = pd.read_json(file_path, **kwargs)
+                self.logger.info(f"Successfully loaded JSON file")
+                return df
+
+            elif file_ext == ".parquet":
+                df = pd.read_parquet(file_path, **kwargs)
+                self.logger.info(f"Successfully loaded Parquet file")
+                return df
+
+            elif file_ext == ".tsv":
+                # Tab-separated values
+                encodings = ["utf-8", "latin1", "cp1252", "iso-8859-1"]
+                for encoding in encodings:
+                    try:
+                        df = pd.read_csv(file_path, sep="\t", encoding=encoding, **kwargs)
+                        self.logger.info(f"Successfully loaded TSV with {encoding} encoding")
+                        return df
+                    except UnicodeDecodeError:
+                        continue
+                raise Exception("Could not read TSV file with any supported encoding")
+
+            else:
+                # Try to read as CSV by default
+                try:
+                    df = pd.read_csv(file_path, **kwargs)
+                    self.logger.info(f"Successfully loaded file as CSV")
+                    return df
+                except Exception as e:
+                    raise Exception(f"Unsupported file format: {file_ext}") from e
+
+        except Exception as e:
+            self.logger.error(f"Failed to load tabular data: {e}")
+            raise
+
+    def _extract_column_info(self, df: pd.DataFrame, return_features: Optional[List[str]] = None) -> List[Dict]:
+        """
+        Extract detailed information about DataFrame columns.
+
+        Args:
+            df: DataFrame to analyze
+            return_features: List of features to include in output
+
+        Returns:
+            List of dictionaries containing column information
+        """
+        column_info = []
+
+        for col in df.columns:
+            try:
+                # Get data type
+                dtype = str(df[col].dtype)
+
+                # Get sample value (first non-null value)
+                sample_value = None
+                non_null_values = df[col].dropna()
+
+                if len(non_null_values) > 0:
+                    sample_value = non_null_values.iloc[0]
+
+                    # Handle different data types for display
+                    if pd.isna(sample_value):
+                        sample_str = "NaN"
+                    elif isinstance(sample_value, float):
+                        if math.isnan(sample_value):
+                            sample_str = "NaN"
+                        else:
+                            sample_str = str(sample_value)
+                    else:
+                        sample_str = str(sample_value)
+                else:
+                    sample_str = "No data"
+
+                # Additional statistics
+                null_count = df[col].isnull().sum()
+                null_percentage = (null_count / len(df)) * 100
+                unique_count = df[col].nunique()
+
+                col_info = {
+                    "column_name": str(col),
+                    "type": dtype,
+                    "sample": sample_str,
+                    "null_count": int(null_count),
+                    "null_percentage": round(null_percentage, 2),
+                    "unique_count": int(unique_count),
+                    "total_rows": len(df),
+                }
+
+                # Add numeric statistics for numeric columns
+                if pd.api.types.is_numeric_dtype(df[col]):
+                    col_info.update(
+                        {
+                            "mean": df[col].mean() if not df[col].empty else None,
+                            "std": df[col].std() if not df[col].empty else None,
+                            "min": df[col].min() if not df[col].empty else None,
+                            "max": df[col].max() if not df[col].empty else None,
+                        }
+                    )
+
+                column_info.append(col_info)
+
+            except Exception as e:
+                self.logger.warning(f"Error processing column '{col}': {e}")
+                column_info.append(
+                    {"column_name": str(col), "type": "unknown", "sample": "Error reading sample", "error": str(e)}
+                )
+
+        return column_info
+
+    def _format_column_info(self, column_info: List[Dict], return_features: Optional[List[str]] = None) -> str:
+        """
+        Format column information as a readable string.
+
+        Args:
+            column_info: List of column information dictionaries
+            return_features: Features to include in output
+
+        Returns:
+            Formatted string representation
+        """
+        if not column_info:
+            return "No columns found"
+
+        if "error" in column_info[0]:
+            return column_info[0]["error"]
+
+        lines = []
+        default_features = ["column_name", "type", "sample", "null_percentage", "unique_count"]
+        features_to_show = return_features if return_features else default_features
+
+        for i, col in enumerate(column_info):
+            # Filter features to show
+            filtered_info = {k: col[k] for k in features_to_show if k in col}
+
+            lines.append(f"- Column {i + 1}: {json.dumps(filtered_info, ensure_ascii=False, default=str)}")
+
+        return "\n".join(lines)
+
+    async def get_tabular_columns(self, file_path: str, return_features: Optional[List[str]] = None) -> str:
+        """
+        Extract raw column metadata from tabular data files.
+
+        This tool directly reads tabular data files and returns basic column
+        information including names, data types, sample values, and statistics.
+        It's useful for understanding the structure of data files before analysis.
+
+        Args:
+            file_path: Path to the tabular data file
+            return_features: List of features to include (column_name, type, sample,
+                null_count, null_percentage, unique_count, mean, std, min, max)
+
+        Returns:
+            Formatted string with column information and statistics
+
+        Supported formats: CSV, Excel (XLSX/XLS), JSON, Parquet, TSV
+
+        Example:
+            info = await get_tabular_columns("data.csv", ["column_name", "type", "sample"])
+        """
+        self.logger.info(f"Analyzing tabular columns for: {file_path}")
+
+        if not os.path.exists(file_path):
+            return f"Error: File '{file_path}' does not exist."
+
+        try:
+            # Load the data
+            df = self._load_tabular_data(file_path)
+
+            # Extract column information
+            column_info = self._extract_column_info(df, return_features)
+
+            # Format and return
+            result = self._format_column_info(column_info, return_features)
+
+            self.logger.info(f"Successfully analyzed {len(column_info)} columns")
+            return result
+
+        except Exception as e:
+            error_msg = f"Error analyzing file '{file_path}': {str(e)}"
+            self.logger.error(error_msg)
+            return error_msg
+
+    async def get_column_info(self, file_path: str) -> str:
+        """
+        Get intelligent analysis and interpretation of column information.
+
+        This tool builds on get_tabular_columns() to provide LLM-powered analysis
+        of the data structure, including file format detection and column meaning
+        interpretation. It's useful for understanding what the data represents.
+
+        Args:
+            file_path: Path to the tabular data file
+
+        Returns:
+            Detailed analysis with file structure and column explanations
+
+        Example:
+            analysis = await get_column_info("sales_data.csv")
+            # Returns structured analysis with column explanations
+        """
+        self.logger.info(f"Getting intelligent column analysis for: {file_path}")
+
+        try:
+            # Get raw column information
+            column_info_str = await self.get_tabular_columns(file_path)
+
+            if column_info_str.startswith("Error:"):
+                return column_info_str
+
+            # Use LLM for intelligent analysis
+            prompt = COLUMN_ANALYSIS_TEMPLATE.format(column_info=column_info_str)
+
+            response = self.llm_client.completion(
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a data analysis expert specializing in tabular data structure analysis.",
+                    },
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0.1,
+                max_tokens=1500,
+            )
+
+            return response.strip()
+
+        except Exception as e:
+            error_msg = f"Error during intelligent analysis: {str(e)}"
+            self.logger.error(error_msg)
+            return error_msg
+
+    async def get_data_summary(self, file_path: str, max_rows: Optional[int] = None) -> str:
+        """
+        Get a comprehensive summary of the tabular data.
+
+        Args:
+            file_path: Path to the tabular data file
+            max_rows: Maximum number of rows to analyze (default: 1000)
+
+        Returns:
+            Comprehensive data summary including statistics and insights
+        """
+        self.logger.info(f"Generating data summary for: {file_path}")
+
+        max_rows = max_rows or self.max_rows_preview
+
+        try:
+            # Load data (with row limit for large files)
+            df = self._load_tabular_data(file_path, nrows=max_rows)
+
+            summary_lines = [
+                f"Data Summary for: {file_path}",
+                "=" * 50,
+                "",
+                f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns",
+                f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB",
+                "",
+            ]
+
+            # Data types summary
+            type_counts = df.dtypes.value_counts()
+            summary_lines.append("Data Types:")
+            for dtype, count in type_counts.items():
+                summary_lines.append(f" {dtype}: {count} columns")
+            summary_lines.append("")
+
+            # Missing data summary
+            missing_data = df.isnull().sum()
+            missing_cols = missing_data[missing_data > 0]
+            if len(missing_cols) > 0:
+                summary_lines.append("Missing Data:")
+                for col, count in missing_cols.items():
+                    percentage = (count / len(df)) * 100
+                    summary_lines.append(f" {col}: {count} ({percentage:.1f}%)")
+            else:
+                summary_lines.append("Missing Data: None")
+            summary_lines.append("")

+            # Numeric columns summary
+            numeric_cols = df.select_dtypes(include=["number"]).columns
+            if len(numeric_cols) > 0:
+                summary_lines.append("Numeric Columns Summary:")
+                desc = df[numeric_cols].describe()
+                summary_lines.append(desc.to_string())
+                summary_lines.append("")
+
+            # Categorical columns summary
+            categorical_cols = df.select_dtypes(include=["object", "category"]).columns
+            if len(categorical_cols) > 0:
+                summary_lines.append("Categorical Columns (Top Values):")
+                for col in categorical_cols[:5]:  # Limit to first 5 categorical columns
+                    top_values = df[col].value_counts().head(3)
+                    summary_lines.append(f" {col}:")
+                    for value, count in top_values.items():
+                        summary_lines.append(f" {value}: {count}")
+                summary_lines.append("")
+
+            # Data quality indicators
+            summary_lines.append("Data Quality Indicators:")
+            duplicate_rows = df.duplicated().sum()
+            summary_lines.append(f" Duplicate rows: {duplicate_rows}")
+
+            # Unique values per column
+            unique_ratios = df.nunique() / len(df)
+            high_cardinality = unique_ratios[unique_ratios > 0.9].index.tolist()
+            if high_cardinality:
+                summary_lines.append(f" High cardinality columns: {', '.join(high_cardinality)}")
+
+            return "\n".join(summary_lines)
+
+        except Exception as e:
+            error_msg = f"Error generating data summary: {str(e)}"
+            self.logger.error(error_msg)
+            return error_msg
+
+    async def validate_data_quality(self, file_path: str) -> str:
+        """
+        Perform data quality validation and return a report.
+
+        Args:
+            file_path: Path to the tabular data file
+
+        Returns:
+            Data quality validation report
+        """
+        self.logger.info(f"Validating data quality for: {file_path}")
+
+        try:
+            df = self._load_tabular_data(file_path, nrows=self.max_rows_preview)
+
+            issues = []
+
+            # Check for missing data
+            missing_data = df.isnull().sum()
+            high_missing = missing_data[missing_data > len(df) * 0.5]
+            if len(high_missing) > 0:
+                issues.append(f"High missing data (>50%): {list(high_missing.index)}")
+
+            # Check for duplicate rows
+            duplicates = df.duplicated().sum()
+            if duplicates > 0:
+                issues.append(f"Duplicate rows found: {duplicates}")
+
+            # Check for columns with single value
+            single_value_cols = [col for col in df.columns if df[col].nunique() <= 1]
+            if single_value_cols:
+                issues.append(f"Columns with single/no values: {single_value_cols}")
+
+            # Check for potential ID columns (high cardinality)
+            potential_ids = [col for col in df.columns if df[col].nunique() == len(df)]
+            if potential_ids:
+                issues.append(f"Potential ID columns (unique values): {potential_ids}")
+
+            # Check for mixed data types in object columns
+            mixed_type_issues = []
+            for col in df.select_dtypes(include=["object"]).columns:
+                sample_types = df[col].dropna().apply(type).unique()
+                if len(sample_types) > 1:
+                    mixed_type_issues.append(col)
+            if mixed_type_issues:
+                issues.append(f"Columns with mixed data types: {mixed_type_issues}")
+
+            # Generate report
+            if issues:
+                report = f"Data Quality Issues Found ({len(issues)}):\n\n"
+                for i, issue in enumerate(issues, 1):
+                    report += f"{i}. {issue}\n"
+            else:
+                report = "✅ No major data quality issues detected."
+
+            return report
+
+        except Exception as e:
+            error_msg = f"Error during data quality validation: {str(e)}"
+            self.logger.error(error_msg)
+            return error_msg
+
+    async def get_tools_map(self) -> Dict[str, Callable]:
+        """
+        Get the mapping of tool names to their implementation functions.
+
+        Returns:
+            Dictionary mapping tool names to callable functions
+        """
+        return {
+            "get_tabular_columns": self.get_tabular_columns,
+            "get_column_info": self.get_column_info,
+            "get_data_summary": self.get_data_summary,
+            "validate_data_quality": self.validate_data_quality,
+        }
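
For reference, below is a minimal usage sketch of the new TabularDataToolkit. It is not part of the diff: it assumes ToolkitConfig can be built from a plain settings dict and that AsyncBaseToolkit wires up self.llm_client; check the noesium 0.2.1 documentation for the exact constructor signatures. The method names and arguments match the code shown above; the configuration keys mirror those read in __init__.

    # Usage sketch (illustrative only; ToolkitConfig construction is an assumption)
    import asyncio

    from noesium.core.toolify.config import ToolkitConfig
    from noesium.toolkits.tabular_data_toolkit import TabularDataToolkit


    async def main() -> None:
        # Hypothetical settings matching the keys read in TabularDataToolkit.__init__
        config = ToolkitConfig(config={"max_rows_preview": 500, "max_file_size": 50 * 1024 * 1024})
        toolkit = TabularDataToolkit(config)

        # Raw column metadata (no LLM call involved)
        print(await toolkit.get_tabular_columns("data.csv", ["column_name", "type", "sample"]))

        # Statistical profile and data quality report
        print(await toolkit.get_data_summary("data.csv", max_rows=500))
        print(await toolkit.validate_data_quality("data.csv"))

        # LLM-backed interpretation; requires an LLM client configured on the toolkit
        print(await toolkit.get_column_info("data.csv"))


    if __name__ == "__main__":
        asyncio.run(main())

Note that get_column_info issues a completion call through self.llm_client, so it needs whatever LLM configuration the other noesium toolkits use; the three other tools only require pandas.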