aigroup-stata-mcp 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. aigroup_stata_mcp-1.0.3.dist-info/METADATA +345 -0
  2. aigroup_stata_mcp-1.0.3.dist-info/RECORD +38 -0
  3. aigroup_stata_mcp-1.0.3.dist-info/WHEEL +4 -0
  4. aigroup_stata_mcp-1.0.3.dist-info/entry_points.txt +5 -0
  5. aigroup_stata_mcp-1.0.3.dist-info/licenses/LICENSE +21 -0
  6. stata_mcp/__init__.py +18 -0
  7. stata_mcp/cli/__init__.py +8 -0
  8. stata_mcp/cli/_cli.py +95 -0
  9. stata_mcp/core/__init__.py +14 -0
  10. stata_mcp/core/data_info/__init__.py +11 -0
  11. stata_mcp/core/data_info/_base.py +288 -0
  12. stata_mcp/core/data_info/csv.py +123 -0
  13. stata_mcp/core/data_info/dta.py +70 -0
  14. stata_mcp/core/stata/__init__.py +13 -0
  15. stata_mcp/core/stata/stata_controller/__init__.py +9 -0
  16. stata_mcp/core/stata/stata_controller/controller.py +208 -0
  17. stata_mcp/core/stata/stata_do/__init__.py +9 -0
  18. stata_mcp/core/stata/stata_do/do.py +177 -0
  19. stata_mcp/core/stata/stata_finder/__init__.py +9 -0
  20. stata_mcp/core/stata/stata_finder/base.py +294 -0
  21. stata_mcp/core/stata/stata_finder/finder.py +193 -0
  22. stata_mcp/core/stata/stata_finder/linux.py +43 -0
  23. stata_mcp/core/stata/stata_finder/macos.py +88 -0
  24. stata_mcp/core/stata/stata_finder/windows.py +191 -0
  25. stata_mcp/server/__init__.py +8 -0
  26. stata_mcp/server/main.py +153 -0
  27. stata_mcp/server/prompts/__init__.py +8 -0
  28. stata_mcp/server/prompts/core_prompts.py +122 -0
  29. stata_mcp/server/tools/__init__.py +10 -0
  30. stata_mcp/server/tools/core_tools.py +59 -0
  31. stata_mcp/server/tools/file_tools.py +163 -0
  32. stata_mcp/server/tools/stata_tools.py +221 -0
  33. stata_mcp/utils/Installer/__init__.py +7 -0
  34. stata_mcp/utils/Installer/installer.py +85 -0
  35. stata_mcp/utils/Prompt/__init__.py +74 -0
  36. stata_mcp/utils/Prompt/string.py +91 -0
  37. stata_mcp/utils/__init__.py +23 -0
  38. stata_mcp/utils/usable.py +244 -0
@@ -0,0 +1,288 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ from abc import ABC, abstractmethod
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List
8
+ from urllib.parse import urlparse
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+
14
+ class DataInfoBase(ABC):
15
+
16
+
17
+
18
+ def __init__(self,
19
+ data_path: str | Path,
20
+ vars_list: List[str] | str = None,
21
+ *,
22
+ encoding: str = "utf-8",
23
+ cache_info: bool = True,
24
+ cache_dir: str | Path = None,
25
+ **kwargs):
26
+ self.data_path = data_path
27
+ self.encoding = encoding
28
+ self._pre_vars_list = vars_list
29
+ self.cache_info = cache_info
30
+ self.cache_dir = Path(cache_dir) if cache_dir else None
31
+ self.kwargs = kwargs # Store additional keyword arguments for subclasses to use
32
+
33
+ # Properties
34
+ @property
35
+ def df(self) -> pd.DataFrame:
36
+ """Get the data as a pandas DataFrame."""
37
+ return self._read_data()
38
+
39
+ @property
40
+ def vars_list(self) -> List[str]:
41
+ """Get the list of selected variables."""
42
+ return self._get_selected_vars(self._pre_vars_list)
43
+
44
+ @property
45
+ def info(self) -> Dict[str, Any]:
46
+ """Get comprehensive information about the data."""
47
+ return {
48
+ "summary": self.summary(),
49
+ }
50
+
51
+ @property
52
+ def is_url(self) -> bool:
53
+ try:
54
+ result = urlparse(str(self.data_path))
55
+ return all([result.scheme, result.netloc])
56
+ except Exception:
57
+ return False
58
+
59
+ # Abstract methods (must be implemented by subclasses)
60
+ @abstractmethod
61
+ def _read_data(self) -> pd.DataFrame:
62
+ """Read data from the source file. Must be implemented by subclasses."""
63
+ ...
64
+
65
+ # Public methods
66
+ def summary(self) -> Dict[str, Any]:
67
+ """
68
+ Provide a summary of the data.
69
+
70
+ Returns:
71
+ Dict[str, Any]: the summary of provided data (vars)
72
+
73
+ Examples:
74
+ >>> from stata_mcp.core.data_info import DtaDataInfo
75
+ >>> data_info = DtaDataInfo(...)
76
+ >>> summary_data = data_info.summary()
77
+ >>> print(summary_data)
78
+ {
79
+ "overview": {
80
+ "obs": 1314, # Observed numbers
81
+ "var_numbers": 10 # equal to the length of `vars_detail`.
82
+ },
83
+ "vars_detail": {
84
+ "name": {
85
+ "type": "str",
86
+ "obs": 1314,
87
+ "value_list": ["Jack", "Rose", ...] # list 10 random unique value
88
+ },
89
+ "age": {
90
+ "type": "float", # it signed as float no matter the value type is int or float
91
+ "obs": 1314,
92
+ "summary": {
93
+ "mean": 52.1,
94
+ "se": 10.3386,
95
+ "min": 18,
96
+ "max": 100
97
+ }
98
+ },
99
+ "male": {
100
+ "type": "float", # Note: no bool type! It is signed with 0 and 1.
101
+ "obs": 1111, # Note: maybe some obs do not have value (NA), this is not be counted.
102
+ "summary": {
103
+ "mean": 0.49955,
104
+ "se": 0.500225,
105
+ "min": 0,
106
+ "max": 1
107
+ }
108
+ }
109
+ "var_name": {}
110
+ }
111
+ }
112
+ """
113
+ df = self.df
114
+ selected_vars = self.vars_list
115
+
116
+ # 基本概览信息
117
+ overview = {
118
+ "obs": len(df),
119
+ "var_numbers": len(selected_vars)
120
+ }
121
+
122
+ # 详细变量信息
123
+ vars_detail = {}
124
+
125
+ for var_name in selected_vars:
126
+ var_series = df[var_name]
127
+ var_info = DataInfoBase._get_variable_info(var_series)
128
+ vars_detail[var_name] = var_info
129
+
130
+ return {
131
+ "overview": overview,
132
+ "vars_detail": vars_detail
133
+ }
134
+
135
+ # Private helper methods
136
+ def _get_selected_vars(self, vars: List[str] | str = None) -> List[str]:
137
+ """
138
+ Get the list of selected variables.
139
+
140
+ If vars is None, return all variables from self.data.
141
+ If vars is a string, convert it to a list.
142
+ Check if all variables exist in self.data, if not raise an error and return all available variables.
143
+
144
+ Args:
145
+ vars: List of variable names, single variable name, or None.
146
+
147
+ Returns:
148
+ List[str]: List of selected variable names.
149
+
150
+ Raises:
151
+ ValueError: If specified variables don't exist in the dataset.
152
+ """
153
+ # Get all available variables from the data
154
+ all_vars = list(self.df.columns)
155
+
156
+ if vars is None:
157
+ return all_vars
158
+
159
+ # Convert string to list if needed
160
+ if isinstance(vars, str):
161
+ vars = [vars]
162
+
163
+ # Check if all specified variables exist in the dataset
164
+ missing_vars = [var for var in vars if var not in all_vars]
165
+
166
+ if missing_vars:
167
+ raise ValueError(f"Variables {missing_vars} not found in dataset. "
168
+ f"Available variables are: {all_vars}")
169
+
170
+ return vars
171
+
172
+ # Helper methods for summary
173
+ @staticmethod
174
+ def _get_variable_info(var_series: pd.Series) -> Dict[str, Any]:
175
+ """
176
+ Get detailed information for a single variable.
177
+
178
+ Args:
179
+ var_series: pandas Series containing the variable data
180
+
181
+ Returns:
182
+ Dict[str, Any]: Variable information including type, observations, and summary statistics
183
+ """
184
+ # Remove NA values for analysis
185
+ non_na_series = var_series.dropna()
186
+ non_na_count = len(non_na_series)
187
+
188
+ # Determine variable type
189
+ var_type = DataInfoBase._determine_variable_type(non_na_series)
190
+
191
+ # Basic variable info
192
+ var_info = {
193
+ "type": var_type,
194
+ "obs": non_na_count
195
+ }
196
+
197
+ # Add type-specific information
198
+ if var_type == "str":
199
+ var_info["value_list"] = DataInfoBase._get_string_value_list(non_na_series)
200
+ else: # float type
201
+ var_info["summary"] = DataInfoBase._get_numeric_summary(non_na_series)
202
+
203
+ return var_info
204
+
205
+ @staticmethod
206
+ def _determine_variable_type(series: pd.Series) -> str:
207
+ """
208
+ Determine the type of a variable.
209
+
210
+ Args:
211
+ series: pandas Series with NA values removed
212
+
213
+ Returns:
214
+ str: "str" for string variables, "float" for numeric variables
215
+ """
216
+ if len(series) == 0:
217
+ return "float" # Default to float for empty series
218
+
219
+ # Check if all non-null values are numeric
220
+ try:
221
+ # Try to convert to numeric
222
+ pd.to_numeric(series, errors='raise')
223
+ return "float"
224
+ except (ValueError, TypeError):
225
+ return "str"
226
+
227
+ @staticmethod
228
+ def _get_string_value_list(series: pd.Series) -> List[str]:
229
+ """
230
+ Get a list of unique string values (up to 10 random values).
231
+
232
+ Args:
233
+ series: pandas Series with NA values removed
234
+
235
+ Returns:
236
+ List[str]: List of up to 10 unique string values
237
+ """
238
+ unique_values = series.unique()
239
+
240
+ if len(unique_values) <= 10:
241
+ return sorted(unique_values.tolist())
242
+ else:
243
+ # Randomly sample 10 values if there are more than 10
244
+ import random
245
+ sampled_values = random.sample(unique_values.tolist(), 10)
246
+ return sorted(sampled_values)
247
+
248
+ @staticmethod
249
+ def _get_numeric_summary(series: pd.Series) -> Dict[str, float]:
250
+ """
251
+ Calculate summary statistics for numeric variables.
252
+
253
+ Args:
254
+ series: pandas Series with NA values removed
255
+
256
+ Returns:
257
+ Dict[str, float]: Summary statistics including mean, se, min, max
258
+ """
259
+ if len(series) == 0:
260
+ return {
261
+ "mean": np.nan,
262
+ "se": np.nan,
263
+ "min": np.nan,
264
+ "max": np.nan
265
+ }
266
+
267
+ # Convert to numeric to handle any remaining type issues
268
+ numeric_series = pd.to_numeric(series, errors='coerce').dropna()
269
+
270
+ if len(numeric_series) == 0:
271
+ return {
272
+ "mean": np.nan,
273
+ "se": np.nan,
274
+ "min": np.nan,
275
+ "max": np.nan
276
+ }
277
+
278
+ mean_val = float(numeric_series.mean())
279
+ std_val = float(numeric_series.std())
280
+ n = len(numeric_series)
281
+ se_val = std_val / np.sqrt(n) if n > 0 else np.nan
282
+
283
+ return {
284
+ "mean": mean_val,
285
+ "se": se_val,
286
+ "min": float(numeric_series.min()),
287
+ "max": float(numeric_series.max())
288
+ }
@@ -0,0 +1,123 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
+ import pandas as pd
9
+
10
+ from ._base import DataInfoBase
11
+
12
+
13
class CsvDataInfo(DataInfoBase):
    """Data-info handler for delimited text files (``.csv``/``.txt``/``.tsv``)."""

    def __init__(self,
                 data_path: str | Path,
                 vars_list: List[str] | str | None = None,
                 *,
                 encoding: str = "utf-8",
                 cache_info: bool = True,
                 cache_dir: str | Path | None = None,
                 **kwargs):
        """
        Initialize CSV data info handler.

        Args:
            data_path: Path to the CSV file
            vars_list: List of variables to analyze, or single variable name
            encoding: File encoding (default: utf-8)
            cache_info: Whether to cache data information (default: True)
            cache_dir: Directory for caching (default: None)
            **kwargs: Additional pandas.read_csv() arguments (sep, header, etc.)
        """
        # Initialize base class with kwargs
        super().__init__(
            data_path=data_path,
            vars_list=vars_list,
            encoding=encoding,
            cache_info=cache_info,
            cache_dir=cache_dir,
            **kwargs
        )

    def _read_data(self) -> pd.DataFrame:
        """
        Read CSV file into pandas DataFrame.

        Automatically detects whether the first row is a header, and
        generates default column names (V1, V2, ...) when there is none
        and the caller supplied no ``names``.

        Returns:
            pd.DataFrame: The data from the CSV file

        Raises:
            FileNotFoundError: If the file does not exist
            ValueError: If the file is not a valid CSV file
        """
        file_path = Path(self.data_path)

        # Check if file exists
        if not file_path.exists():
            raise FileNotFoundError(f"CSV file not found: {file_path}")

        # Check if it's a CSV-like file
        valid_extensions = {'.csv', '.txt', '.tsv'}
        if file_path.suffix.lower() not in valid_extensions:
            raise ValueError(f"File must have extension in {valid_extensions}, got: {file_path.suffix}")

        # Work on a copy so repeated reads never mutate self.kwargs.
        read_kwargs = dict(self.kwargs)
        try:
            # Auto-detect header only when the caller did not decide.
            if 'header' not in read_kwargs:
                read_kwargs['header'] = self._detect_header(file_path, read_kwargs)

            # Provide default column names for header-less files, but never
            # clobber names the caller supplied explicitly.
            if read_kwargs.get('header') is None and 'names' not in read_kwargs:
                read_kwargs['names'] = self._default_column_names(file_path, read_kwargs)

            # Read the CSV file
            return pd.read_csv(file_path, **read_kwargs)

        except Exception as e:
            raise ValueError(f"Error reading CSV file {file_path}: {str(e)}") from e

    @staticmethod
    def _detect_header(file_path: Path, read_kwargs: dict) -> int | None:
        """Guess whether the first row is a header.

        Returns 0 (first row is header) or None (no header). Heuristic:
        read a small sample assuming a header; if any resulting column
        name parses as a number, the first row is probably data.
        """
        sample_kwargs = {k: v for k, v in read_kwargs.items() if k not in ('header', 'names')}
        try:
            sample = pd.read_csv(file_path, nrows=10, header=0, **sample_kwargs)
        except Exception:
            # If detection fails, default to header=0
            return 0

        for col_name in sample.columns:
            try:
                float(str(col_name))
                return None  # numeric "column name" -> first row is data
            except (ValueError, TypeError):
                continue
        return 0

    @staticmethod
    def _default_column_names(file_path: Path, read_kwargs: dict) -> List[str]:
        """Generate V1..Vn column names by counting columns in the first row."""
        sample_kwargs = {k: v for k, v in read_kwargs.items() if k not in ('header', 'names')}
        first_row = pd.read_csv(file_path, nrows=1, header=None, **sample_kwargs)
        return [f'V{i+1}' for i in range(len(first_row.columns))]
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ from io import BytesIO
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+ import requests
10
+
11
+ from ._base import DataInfoBase
12
+
13
+
14
class DtaDataInfo(DataInfoBase):
    """Data-info handler for Stata ``.dta`` files (local path or URL)."""

    def _read_data(self) -> pd.DataFrame:
        """
        Read Stata dta file into pandas DataFrame.

        Supports a local file path or an HTTP(S) URL pointing at a
        ``.dta`` file (downloaded into memory before parsing).

        Returns:
            pd.DataFrame: The data from the Stata file

        Raises:
            FileNotFoundError: If the file does not exist
            ValueError: If the file is not a valid Stata file
        """
        # Check if it's a URL first
        if self.is_url:
            # For URLs, validate the extension from the URL path
            # (ignores any query string).
            from urllib.parse import urlparse
            parsed_url = urlparse(str(self.data_path))
            url_path = parsed_url.path
            if not url_path.lower().endswith('.dta'):
                raise ValueError(f"URL must point to a .dta file, got: {url_path}")
            file_path = None  # Not used for URLs
        else:
            # For local files, convert to Path object and validate
            file_path = Path(self.data_path)

            # Check if file exists
            if not file_path.exists():
                raise FileNotFoundError(f"Stata file not found: {file_path}")

            # Check if it's a .dta file
            if file_path.suffix.lower() != '.dta':
                raise ValueError(f"File must have .dta extension, got: {file_path.suffix}")

        try:
            buffer = None
            if self.is_url:
                # Download into memory; the timeout prevents an unreachable
                # server from hanging the whole request forever.
                resp = requests.get(self.data_path, timeout=30)
                resp.raise_for_status()
                buffer = BytesIO(resp.content)

            # convert_categoricals=False keeps the raw coded values instead
            # of mapping them to their value labels.
            df = pd.read_stata(
                buffer or file_path,
                convert_categoricals=False,
                convert_dates=True,
                convert_missing=False,
                preserve_dtypes=True
            )
            return df

        except Exception as e:
            raise ValueError(f"Error reading Stata file {self.data_path}: {str(e)}") from e
@@ -0,0 +1,13 @@
1
#!/usr/bin/python3
# -*- coding: utf-8 -*-

"""Package namespace for the Stata integration layer.

Re-exports the public classes from the controller, do-file, and finder
subpackages so callers can import them from this package directly.
"""

from .stata_controller import StataController
from .stata_do import StataDo
from .stata_finder import StataFinder

# Explicit public API for `from ... import *`.
__all__ = [
    "StataFinder",
    "StataController",
    "StataDo"
]
@@ -0,0 +1,9 @@
1
#!/usr/bin/python3
# -*- coding: utf-8 -*-

"""Package namespace for the Stata controller.

Re-exports :class:`StataController` from the implementation module.
"""

from .controller import StataController

# Explicit public API for `from ... import *`.
__all__ = [
    "StataController",
]