aigroup-stata-mcp 1.0.3 (aigroup_stata_mcp-1.0.3-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aigroup_stata_mcp-1.0.3.dist-info/METADATA +345 -0
- aigroup_stata_mcp-1.0.3.dist-info/RECORD +38 -0
- aigroup_stata_mcp-1.0.3.dist-info/WHEEL +4 -0
- aigroup_stata_mcp-1.0.3.dist-info/entry_points.txt +5 -0
- aigroup_stata_mcp-1.0.3.dist-info/licenses/LICENSE +21 -0
- stata_mcp/__init__.py +18 -0
- stata_mcp/cli/__init__.py +8 -0
- stata_mcp/cli/_cli.py +95 -0
- stata_mcp/core/__init__.py +14 -0
- stata_mcp/core/data_info/__init__.py +11 -0
- stata_mcp/core/data_info/_base.py +288 -0
- stata_mcp/core/data_info/csv.py +123 -0
- stata_mcp/core/data_info/dta.py +70 -0
- stata_mcp/core/stata/__init__.py +13 -0
- stata_mcp/core/stata/stata_controller/__init__.py +9 -0
- stata_mcp/core/stata/stata_controller/controller.py +208 -0
- stata_mcp/core/stata/stata_do/__init__.py +9 -0
- stata_mcp/core/stata/stata_do/do.py +177 -0
- stata_mcp/core/stata/stata_finder/__init__.py +9 -0
- stata_mcp/core/stata/stata_finder/base.py +294 -0
- stata_mcp/core/stata/stata_finder/finder.py +193 -0
- stata_mcp/core/stata/stata_finder/linux.py +43 -0
- stata_mcp/core/stata/stata_finder/macos.py +88 -0
- stata_mcp/core/stata/stata_finder/windows.py +191 -0
- stata_mcp/server/__init__.py +8 -0
- stata_mcp/server/main.py +153 -0
- stata_mcp/server/prompts/__init__.py +8 -0
- stata_mcp/server/prompts/core_prompts.py +122 -0
- stata_mcp/server/tools/__init__.py +10 -0
- stata_mcp/server/tools/core_tools.py +59 -0
- stata_mcp/server/tools/file_tools.py +163 -0
- stata_mcp/server/tools/stata_tools.py +221 -0
- stata_mcp/utils/Installer/__init__.py +7 -0
- stata_mcp/utils/Installer/installer.py +85 -0
- stata_mcp/utils/Prompt/__init__.py +74 -0
- stata_mcp/utils/Prompt/string.py +91 -0
- stata_mcp/utils/__init__.py +23 -0
- stata_mcp/utils/usable.py +244 -0
stata_mcp/core/data_info/_base.py
@@ -0,0 +1,288 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-


import random
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List
from urllib.parse import urlparse

import numpy as np
import pandas as pd


class DataInfoBase(ABC):

    def __init__(self,
                 data_path: str | Path,
                 vars_list: List[str] | str | None = None,
                 *,
                 encoding: str = "utf-8",
                 cache_info: bool = True,
                 cache_dir: str | Path | None = None,
                 **kwargs):
        self.data_path = data_path
        self.encoding = encoding
        self._pre_vars_list = vars_list
        self.cache_info = cache_info
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.kwargs = kwargs  # Store additional keyword arguments for subclasses to use

    # Properties
    @property
    def df(self) -> pd.DataFrame:
        """Get the data as a pandas DataFrame."""
        return self._read_data()

    @property
    def vars_list(self) -> List[str]:
        """Get the list of selected variables."""
        return self._get_selected_vars(self._pre_vars_list)

    @property
    def info(self) -> Dict[str, Any]:
        """Get comprehensive information about the data."""
        return {
            "summary": self.summary(),
        }

    @property
    def is_url(self) -> bool:
        try:
            result = urlparse(str(self.data_path))
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    # Abstract methods (must be implemented by subclasses)
    @abstractmethod
    def _read_data(self) -> pd.DataFrame:
        """Read data from the source file. Must be implemented by subclasses."""
        ...

    # Public methods
    def summary(self) -> Dict[str, Any]:
        """
        Provide a summary of the data.

        Returns:
            Dict[str, Any]: the summary of the selected variables

        Examples:
            >>> from stata_mcp.core.data_info import DtaDataInfo
            >>> data_info = DtaDataInfo(...)
            >>> summary_data = data_info.summary()
            >>> print(summary_data)
            {
                "overview": {
                    "obs": 1314,  # number of observations
                    "var_numbers": 10  # equal to the length of `vars_detail`
                },
                "vars_detail": {
                    "name": {
                        "type": "str",
                        "obs": 1314,
                        "value_list": ["Jack", "Rose", ...]  # up to 10 random unique values
                    },
                    "age": {
                        "type": "float",  # reported as float whether the values are int or float
                        "obs": 1314,
                        "summary": {
                            "mean": 52.1,
                            "se": 10.3386,
                            "min": 18,
                            "max": 100
                        }
                    },
                    "male": {
                        "type": "float",  # note: there is no bool type; booleans are coded as 0 and 1
                        "obs": 1111,  # note: observations with missing values (NA) are not counted
                        "summary": {
                            "mean": 0.49955,
                            "se": 0.500225,
                            "min": 0,
                            "max": 1
                        }
                    },
                    "var_name": {}
                }
            }
        """
        df = self.df
        selected_vars = self.vars_list

        # Basic overview information
        overview = {
            "obs": len(df),
            "var_numbers": len(selected_vars)
        }

        # Detailed per-variable information
        vars_detail = {}

        for var_name in selected_vars:
            var_series = df[var_name]
            var_info = DataInfoBase._get_variable_info(var_series)
            vars_detail[var_name] = var_info

        return {
            "overview": overview,
            "vars_detail": vars_detail
        }

    # Private helper methods
    def _get_selected_vars(self, vars: List[str] | str | None = None) -> List[str]:
        """
        Get the list of selected variables.

        If vars is None, return all variables in the data.
        If vars is a string, convert it to a single-element list.
        Check that every requested variable exists in the data; if any are
        missing, raise a ValueError that lists the available variables.

        Args:
            vars: List of variable names, a single variable name, or None.

        Returns:
            List[str]: List of selected variable names.

        Raises:
            ValueError: If specified variables don't exist in the dataset.
        """
        # Get all available variables from the data
        all_vars = list(self.df.columns)

        if vars is None:
            return all_vars

        # Convert string to list if needed
        if isinstance(vars, str):
            vars = [vars]

        # Check that all specified variables exist in the dataset
        missing_vars = [var for var in vars if var not in all_vars]

        if missing_vars:
            raise ValueError(f"Variables {missing_vars} not found in dataset. "
                             f"Available variables are: {all_vars}")

        return vars

    # Helper methods for summary
    @staticmethod
    def _get_variable_info(var_series: pd.Series) -> Dict[str, Any]:
        """
        Get detailed information for a single variable.

        Args:
            var_series: pandas Series containing the variable data

        Returns:
            Dict[str, Any]: Variable information including type, observations, and summary statistics
        """
        # Remove NA values for analysis
        non_na_series = var_series.dropna()
        non_na_count = len(non_na_series)

        # Determine variable type
        var_type = DataInfoBase._determine_variable_type(non_na_series)

        # Basic variable info
        var_info = {
            "type": var_type,
            "obs": non_na_count
        }

        # Add type-specific information
        if var_type == "str":
            var_info["value_list"] = DataInfoBase._get_string_value_list(non_na_series)
        else:  # float type
            var_info["summary"] = DataInfoBase._get_numeric_summary(non_na_series)

        return var_info

    @staticmethod
    def _determine_variable_type(series: pd.Series) -> str:
        """
        Determine the type of a variable.

        Args:
            series: pandas Series with NA values removed

        Returns:
            str: "str" for string variables, "float" for numeric variables
        """
        if len(series) == 0:
            return "float"  # Default to float for empty series

        # Check whether all non-null values are numeric
        try:
            # Try to convert to numeric
            pd.to_numeric(series, errors='raise')
            return "float"
        except (ValueError, TypeError):
            return "str"

    @staticmethod
    def _get_string_value_list(series: pd.Series) -> List[str]:
        """
        Get a list of unique string values (up to 10 random values).

        Args:
            series: pandas Series with NA values removed

        Returns:
            List[str]: List of up to 10 unique string values
        """
        unique_values = series.unique()

        if len(unique_values) <= 10:
            return sorted(unique_values.tolist())
        else:
            # Randomly sample 10 values if there are more than 10
            sampled_values = random.sample(unique_values.tolist(), 10)
            return sorted(sampled_values)

    @staticmethod
    def _get_numeric_summary(series: pd.Series) -> Dict[str, float]:
        """
        Calculate summary statistics for numeric variables.

        Args:
            series: pandas Series with NA values removed

        Returns:
            Dict[str, float]: Summary statistics including mean, se, min, max
        """
        if len(series) == 0:
            return {
                "mean": np.nan,
                "se": np.nan,
                "min": np.nan,
                "max": np.nan
            }

        # Convert to numeric to handle any remaining type issues
        numeric_series = pd.to_numeric(series, errors='coerce').dropna()

        if len(numeric_series) == 0:
            return {
                "mean": np.nan,
                "se": np.nan,
                "min": np.nan,
                "max": np.nan
            }

        mean_val = float(numeric_series.mean())
        std_val = float(numeric_series.std())
        n = len(numeric_series)
        se_val = std_val / np.sqrt(n) if n > 0 else np.nan  # standard error of the mean

        return {
            "mean": mean_val,
            "se": se_val,
            "min": float(numeric_series.min()),
            "max": float(numeric_series.max())
        }
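DataInfoBase leaves only _read_data abstract, so supporting a new source format takes one reader method; variable selection and summarization are inherited. A minimal sketch of a hypothetical subclass, assuming a JSON records file (JsonDataInfo and people.json are illustrative names, not part of the package):

import pandas as pd

from stata_mcp.core.data_info._base import DataInfoBase


class JsonDataInfo(DataInfoBase):
    """Hypothetical subclass: reads a JSON records file."""

    def _read_data(self) -> pd.DataFrame:
        # pd.read_json accepts local paths as well as URLs
        return pd.read_json(self.data_path)


info = JsonDataInfo("people.json", vars_list=["name", "age"])
print(info.summary())  # {"overview": {...}, "vars_detail": {...}}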
stata_mcp/core/data_info/csv.py
@@ -0,0 +1,123 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-


from pathlib import Path
from typing import List

import pandas as pd

from ._base import DataInfoBase


class CsvDataInfo(DataInfoBase):

    def __init__(self,
                 data_path: str | Path,
                 vars_list: List[str] | str | None = None,
                 *,
                 encoding: str = "utf-8",
                 cache_info: bool = True,
                 cache_dir: str | Path | None = None,
                 **kwargs):
        """
        Initialize the CSV data info handler.

        Args:
            data_path: Path to the CSV file
            vars_list: List of variables to analyze, or a single variable name
            encoding: File encoding (default: utf-8)
            cache_info: Whether to cache data information (default: True)
            cache_dir: Directory for caching (default: None)
            **kwargs: Additional pandas.read_csv() arguments (sep, header, etc.)
        """
        # Initialize the base class with kwargs
        super().__init__(
            data_path=data_path,
            vars_list=vars_list,
            encoding=encoding,
            cache_info=cache_info,
            cache_dir=cache_dir,
            **kwargs
        )

    def _read_data(self) -> pd.DataFrame:
        """
        Read a CSV file into a pandas DataFrame.

        Automatically detects whether the first row is a header and handles
        the common delimited formats (.csv, .txt, .tsv).

        Returns:
            pd.DataFrame: The data from the CSV file

        Raises:
            FileNotFoundError: If the file does not exist
            ValueError: If the file is not a valid CSV file
        """
        # Convert to a Path object if it's a string
        file_path = Path(self.data_path)

        # Check that the file exists
        if not file_path.exists():
            raise FileNotFoundError(f"CSV file not found: {file_path}")

        # Check that it has a supported extension
        valid_extensions = {'.csv', '.txt', '.tsv'}
        if file_path.suffix.lower() not in valid_extensions:
            raise ValueError(f"File must have extension in {valid_extensions}, got: {file_path.suffix}")

        try:
            # Auto-detect the header if not explicitly specified
            if 'header' not in self.kwargs:
                # Read the first few lines to detect a header
                sample_kwargs = {k: v for k, v in self.kwargs.items() if k not in ['header', 'names']}

                # Try reading with header=0 (assume the first row is a header)
                try:
                    df_with_header = pd.read_csv(file_path, nrows=10, header=0,
                                                 encoding=self.encoding, **sample_kwargs)

                    # Simple heuristic: if any column name parses as a number,
                    # the first row is probably data rather than a header
                    column_names = df_with_header.columns.tolist()

                    looks_like_data = False
                    for col_name in column_names:
                        # Try to convert the column name to float
                        try:
                            float(str(col_name))
                            looks_like_data = True
                            break
                        except (ValueError, TypeError):
                            continue

                    if looks_like_data:
                        # Column names look like data values, so there is no header
                        self.kwargs['header'] = None
                    else:
                        # Column names don't look like data, assume a header exists
                        self.kwargs['header'] = 0

                except Exception:
                    # If detection fails, default to header=0
                    self.kwargs['header'] = 0

            # Handle the no-header case by providing default column names
            if self.kwargs.get('header') is None and 'names' not in self.kwargs:
                # First, read a sample row to determine the number of columns
                sample_kwargs = {k: v for k, v in self.kwargs.items() if k not in ['header', 'names']}
                sample_df = pd.read_csv(file_path, nrows=1, header=None,
                                        encoding=self.encoding, **sample_kwargs)
                num_cols = len(sample_df.columns)

                # Generate default column names V1, V2, ...
                self.kwargs['names'] = [f'V{i+1}' for i in range(num_cols)]

            # Read the CSV file, passing the configured encoding through to pandas
            df = pd.read_csv(file_path, encoding=self.encoding, **self.kwargs)

            return df

        except Exception as e:
            raise ValueError(f"Error reading CSV file {file_path}: {str(e)}") from e
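CsvDataInfo only runs header detection when the caller has not pinned header down, so both the detected and the explicit paths are usable. A short usage sketch (data.csv and raw.tsv are placeholder file names; the package-level import mirrors the DtaDataInfo import shown in the base-class docstring):

from stata_mcp.core.data_info import CsvDataInfo

# Header row auto-detected; summarize two columns only
info = CsvDataInfo("data.csv", vars_list=["age", "income"])
print(info.summary())

# Headerless tab-separated file: pass header=None explicitly and the
# reader assigns the default column names V1, V2, ...
raw = CsvDataInfo("raw.tsv", sep="\t", header=None)
print(raw.vars_list)  # ['V1', 'V2', ...]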
stata_mcp/core/data_info/dta.py
@@ -0,0 +1,70 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-


from io import BytesIO
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd
import requests

from ._base import DataInfoBase


class DtaDataInfo(DataInfoBase):

    def _read_data(self) -> pd.DataFrame:
        """
        Read a Stata .dta file into a pandas DataFrame.

        Returns:
            pd.DataFrame: The data from the Stata file

        Raises:
            FileNotFoundError: If the file does not exist
            ValueError: If the file is not a valid Stata file
        """
        # Check whether the source is a URL first
        if self.is_url:
            # For URLs, validate the file extension from the URL string
            parsed_url = urlparse(str(self.data_path))
            url_path = parsed_url.path
            if not url_path.lower().endswith('.dta'):
                raise ValueError(f"URL must point to a .dta file, got: {url_path}")
            file_path = None  # Not used for URLs
        else:
            # For local files, convert to a Path object and validate
            file_path = Path(self.data_path)

            # Check that the file exists
            if not file_path.exists():
                raise FileNotFoundError(f"Stata file not found: {file_path}")

            # Check that it's a .dta file
            if file_path.suffix.lower() != '.dta':
                raise ValueError(f"File must have .dta extension, got: {file_path.suffix}")

        try:
            # For URLs, download the file into an in-memory buffer
            buffer = None
            if self.is_url:
                resp = requests.get(self.data_path, timeout=60)  # bounded wait so a dead server can't hang the read
                resp.raise_for_status()
                buffer = BytesIO(resp.content)

            # Read the Stata file. convert_categoricals=False keeps the raw
            # codes instead of mapping value labels onto categorical strings,
            # preserving the original data structure.
            df = pd.read_stata(
                buffer or file_path,
                convert_categoricals=False,
                convert_dates=True,
                convert_missing=False,
                preserve_dtypes=True
            )
            return df

        except Exception as e:
            raise ValueError(f"Error reading Stata file {self.data_path}: {str(e)}") from e
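Because _read_data resolves URLs and local paths through the same branch, one class covers both cases. A brief usage sketch (the URL and file name are placeholders):

from stata_mcp.core.data_info import DtaDataInfo

# Local file; vars_list accepts a single name as well as a list
local = DtaDataInfo("survey.dta", vars_list="age")
print(local.summary()["vars_detail"]["age"])  # type, obs, summary stats

# Remote file: downloaded into an in-memory buffer, then parsed
remote = DtaDataInfo("https://example.com/data/survey.dta")
print(remote.info)  # {"summary": {...}}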