saslite 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- saslite-0.1.0/DOCS.md +1551 -0
- saslite-0.1.0/LICENSE +21 -0
- saslite-0.1.0/MANIFEST.in +8 -0
- saslite-0.1.0/PKG-INFO +244 -0
- saslite-0.1.0/README.md +212 -0
- saslite-0.1.0/examples/basic_sql.sas +32 -0
- saslite-0.1.0/examples/hello_data_step.sas +22 -0
- saslite-0.1.0/examples/import_export.sas +27 -0
- saslite-0.1.0/pyproject.toml +58 -0
- saslite-0.1.0/setup.cfg +4 -0
- saslite-0.1.0/src/saslite/__init__.py +7 -0
- saslite-0.1.0/src/saslite/__main__.py +5 -0
- saslite-0.1.0/src/saslite/api/__init__.py +0 -0
- saslite-0.1.0/src/saslite/api/facade.py +208 -0
- saslite-0.1.0/src/saslite/api/results.py +5 -0
- saslite-0.1.0/src/saslite/ast/__init__.py +0 -0
- saslite-0.1.0/src/saslite/ast/base.py +22 -0
- saslite-0.1.0/src/saslite/ast/data_step.py +207 -0
- saslite-0.1.0/src/saslite/ast/expressions.py +105 -0
- saslite-0.1.0/src/saslite/ast/proc.py +92 -0
- saslite-0.1.0/src/saslite/ast/program.py +36 -0
- saslite-0.1.0/src/saslite/ast/sql.py +106 -0
- saslite-0.1.0/src/saslite/cli/__init__.py +0 -0
- saslite-0.1.0/src/saslite/cli/main.py +153 -0
- saslite-0.1.0/src/saslite/diagnostics/__init__.py +0 -0
- saslite-0.1.0/src/saslite/diagnostics/errors.py +50 -0
- saslite-0.1.0/src/saslite/diagnostics/reporter.py +56 -0
- saslite-0.1.0/src/saslite/executor/__init__.py +0 -0
- saslite-0.1.0/src/saslite/executor/data_step/__init__.py +0 -0
- saslite-0.1.0/src/saslite/executor/data_step/executor.py +1130 -0
- saslite-0.1.0/src/saslite/executor/dispatcher.py +89 -0
- saslite-0.1.0/src/saslite/executor/expression_eval.py +385 -0
- saslite-0.1.0/src/saslite/executor/libname.py +78 -0
- saslite-0.1.0/src/saslite/executor/proc/__init__.py +0 -0
- saslite-0.1.0/src/saslite/executor/proc/registry.py +880 -0
- saslite-0.1.0/src/saslite/executor/sql/__init__.py +0 -0
- saslite-0.1.0/src/saslite/executor/sql/executor.py +2150 -0
- saslite-0.1.0/src/saslite/functions/__init__.py +100 -0
- saslite-0.1.0/src/saslite/functions/char_funcs.py +325 -0
- saslite-0.1.0/src/saslite/functions/conditional_funcs.py +36 -0
- saslite-0.1.0/src/saslite/functions/convert_funcs.py +207 -0
- saslite-0.1.0/src/saslite/functions/date_funcs.py +308 -0
- saslite-0.1.0/src/saslite/functions/numeric_funcs.py +213 -0
- saslite-0.1.0/src/saslite/functions/registry.py +31 -0
- saslite-0.1.0/src/saslite/macro/__init__.py +0 -0
- saslite-0.1.0/src/saslite/macro/expander.py +547 -0
- saslite-0.1.0/src/saslite/parser/__init__.py +0 -0
- saslite-0.1.0/src/saslite/parser/grammar/saslite.lark +420 -0
- saslite-0.1.0/src/saslite/parser/program_parser.py +46 -0
- saslite-0.1.0/src/saslite/parser/transformer.py +1912 -0
- saslite-0.1.0/src/saslite/planner/__init__.py +0 -0
- saslite-0.1.0/src/saslite/runtime/__init__.py +0 -0
- saslite-0.1.0/src/saslite/runtime/dataset.py +112 -0
- saslite-0.1.0/src/saslite/runtime/execution_result.py +36 -0
- saslite-0.1.0/src/saslite/runtime/formatting.py +68 -0
- saslite-0.1.0/src/saslite/runtime/metadata.py +76 -0
- saslite-0.1.0/src/saslite/runtime/pdv.py +166 -0
- saslite-0.1.0/src/saslite/runtime/types.py +72 -0
- saslite-0.1.0/src/saslite/session/__init__.py +0 -0
- saslite-0.1.0/src/saslite/session/session.py +100 -0
- saslite-0.1.0/src/saslite/source/__init__.py +0 -0
- saslite-0.1.0/src/saslite/storage/__init__.py +0 -0
- saslite-0.1.0/src/saslite/storage/base.py +32 -0
- saslite-0.1.0/src/saslite/storage/csv_backend.py +91 -0
- saslite-0.1.0/src/saslite/storage/memory.py +31 -0
- saslite-0.1.0/src/saslite/storage/path_resolver.py +32 -0
- saslite-0.1.0/src/saslite/storage/sas_backend.py +280 -0
- saslite-0.1.0/src/saslite/testing/__init__.py +0 -0
- saslite-0.1.0/src/saslite.egg-info/PKG-INFO +244 -0
- saslite-0.1.0/src/saslite.egg-info/SOURCES.txt +72 -0
- saslite-0.1.0/src/saslite.egg-info/dependency_links.txt +1 -0
- saslite-0.1.0/src/saslite.egg-info/entry_points.txt +2 -0
- saslite-0.1.0/src/saslite.egg-info/requires.txt +11 -0
- saslite-0.1.0/src/saslite.egg-info/top_level.txt +1 -0
saslite-0.1.0/DOCS.md
ADDED
|
@@ -0,0 +1,1551 @@
|
|
|
1
|
+
# SASLite 使用说明文档
|
|
2
|
+
|
|
3
|
+
> 轻量级本地 SAS 语言解释器 | 基于 Python + Pandas | 版本 0.1.0
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 目录
|
|
8
|
+
|
|
9
|
+
1. [快速开始](#1-快速开始)
|
|
10
|
+
2. [Python API](#2-python-api)
|
|
11
|
+
3. [DATA Step 语法](#3-data-step-语法)
|
|
12
|
+
4. [PROC SQL 语法](#4-proc-sql-语法)
|
|
13
|
+
5. [LIBNAME — 库引用](#5-libname--库引用)
|
|
14
|
+
6. [PROC 过程步](#6-proc-过程步)
|
|
15
|
+
7. [宏系统](#7-宏系统)
|
|
16
|
+
8. [内置函数参考](#8-内置函数参考)
|
|
17
|
+
9. [表达式与运算符](#9-表达式与运算符)
|
|
18
|
+
10. [已知限制](#10-已知限制)
|
|
19
|
+
11. [未实现的 SAS 功能](#11-未实现的-sas-功能)
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 1. 快速开始
|
|
24
|
+
|
|
25
|
+
### 安装
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install saslite
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
如需从本地源码目录开发安装:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install -e ".[excel,gui]"
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### 基本用法
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from saslite import SasInterpreter
|
|
41
|
+
|
|
42
|
+
sas = SasInterpreter()
|
|
43
|
+
|
|
44
|
+
# 直接执行 SAS 代码
|
|
45
|
+
result = sas.execute('''
|
|
46
|
+
DATA employees;
|
|
47
|
+
INPUT name $ salary;
|
|
48
|
+
DATALINES;
|
|
49
|
+
Alice 50000
|
|
50
|
+
Bob 60000
|
|
51
|
+
;
|
|
52
|
+
RUN;
|
|
53
|
+
|
|
54
|
+
PROC PRINT DATA=employees; RUN;
|
|
55
|
+
''')
|
|
56
|
+
|
|
57
|
+
print(result.success) # True/False
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### 从 Python DataFrame 创建数据集
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import pandas as pd
|
|
64
|
+
|
|
65
|
+
df = pd.DataFrame({'id': [1,2,3], 'name': ['A','B','C'], 'salary': [50000,60000,55000]})
|
|
66
|
+
sas.create_dataset('employees', df) # 默认存入 WORK 库
|
|
67
|
+
|
|
68
|
+
# 然后就可以在 SAS 代码中引用
|
|
69
|
+
sas.execute('PROC PRINT DATA=employees; RUN;')
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### 导入导出 CSV
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
# Python API 方式
|
|
76
|
+
sas.import_csv('data.csv', 'mydata')
|
|
77
|
+
sas.export_csv('mydata', 'output.csv')
|
|
78
|
+
|
|
79
|
+
# 或在 SAS 代码中
|
|
80
|
+
sas.execute('''
|
|
81
|
+
PROC IMPORT DATAFILE="data.csv" OUT=work.mydata DBMS=CSV; RUN;
|
|
82
|
+
PROC EXPORT DATA=work.mydata OUTFILE="output.csv" DBMS=CSV; RUN;
|
|
83
|
+
''')
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 执行文件
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
result = sas.execute_file('script.sas')
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### 网页 GUI
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
python gui/app.py
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
启动后在浏览器打开 `http://localhost:5000`。网页内置 demo 覆盖基础 DATA Step、PROC SQL、CASE WHEN、LIKE / BETWEEN / IS NULL、LAG / DIF、PROC MEANS、PROC PRINT BY、LIBNAME、宏系统和综合示例。
|
|
99
|
+
|
|
100
|
+
后端接口:
|
|
101
|
+
- `POST /api/execute`:执行 SAS 代码
|
|
102
|
+
- `GET /api/datasets`:列出数据集
|
|
103
|
+
- `GET /api/libraries`:按库列出数据集
|
|
104
|
+
- `GET /api/datasets/<libref>/<name>`:读取数据集内容
|
|
105
|
+
- `DELETE /api/datasets/<libref>/<name>`:删除数据集
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## 2. Python API
|
|
110
|
+
|
|
111
|
+
### SasInterpreter 类
|
|
112
|
+
|
|
113
|
+
| 方法 | 签名 | 说明 |
|
|
114
|
+
|------|------|------|
|
|
115
|
+
| `__init__` | `(work_dir: str = None)` | 创建解释器实例,可选工作目录 |
|
|
116
|
+
| `execute` | `(source: str, source_name: str = "<input>") -> RunSummary` | 执行 SAS 源代码 |
|
|
117
|
+
| `execute_file` | `(path: str) -> RunSummary` | 执行 .sas 文件 |
|
|
118
|
+
| `create_dataset` | `(name: str, df: DataFrame, libref: str = "WORK")` | 从 pandas DataFrame 创建数据集 |
|
|
119
|
+
| `get_dataset` | `(libref: str, name: str) -> DataFrame` | 获取数据集为 pandas DataFrame |
|
|
120
|
+
| `import_csv` | `(filepath: str, dataset_name: str, libref: str = "WORK")` | 导入 CSV 文件 |
|
|
121
|
+
| `export_csv` | `(dataset_name: str, filepath: str, libref: str = "WORK")` | 导出数据集为 CSV |
|
|
122
|
+
|
|
123
|
+
### RunSummary 对象
|
|
124
|
+
|
|
125
|
+
| 属性 | 类型 | 说明 |
|
|
126
|
+
|------|------|------|
|
|
127
|
+
| `success` | `bool` | 全部步骤是否成功 |
|
|
128
|
+
| `steps` | `list[StepResult]` | 每个步骤的结果 |
|
|
129
|
+
| `error` | `str \| None` | 错误信息 |
|
|
130
|
+
| `total_steps` | `int` | 执行的步骤数 |
|
|
131
|
+
|
|
132
|
+
### 库引用
|
|
133
|
+
|
|
134
|
+
- `WORK` 是默认的临时库,数据在解释器实例生命周期内保持
|
|
135
|
+
- 支持 `libref.dataset` 语法,如 `WORK.employees`
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## 3. DATA Step 语法
|
|
140
|
+
|
|
141
|
+
### 基本结构
|
|
142
|
+
|
|
143
|
+
```sas
|
|
144
|
+
DATA target_dataset;
|
|
145
|
+
/* statements */
|
|
146
|
+
RUN;
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
`target_dataset` 可以是:
|
|
150
|
+
- 简单名称:`DATA mydata;`(等同于 `WORK.mydata`)
|
|
151
|
+
- 限定名称:`DATA work.mydata;`
|
|
152
|
+
- `_NULL_`:不创建输出数据集
|
|
153
|
+
|
|
154
|
+
### 已实现的 DATA Step 语句
|
|
155
|
+
|
|
156
|
+
#### INPUT + DATALINES — 内联数据输入
|
|
157
|
+
|
|
158
|
+
```sas
|
|
159
|
+
/* 列表模式 — 空格分隔,$ 表示字符变量 */
|
|
160
|
+
DATA employees;
|
|
161
|
+
INPUT name $ salary;
|
|
162
|
+
DATALINES;
|
|
163
|
+
Alice 50000
|
|
164
|
+
Bob 60000
|
|
165
|
+
;
|
|
166
|
+
RUN;
|
|
167
|
+
|
|
168
|
+
/* 多个变量 */
|
|
169
|
+
DATA mixed;
|
|
170
|
+
INPUT id name $ dept $ salary;
|
|
171
|
+
DATALINES;
|
|
172
|
+
1 Alice HR 50000
|
|
173
|
+
2 Bob IT 60000
|
|
174
|
+
;
|
|
175
|
+
RUN;
|
|
176
|
+
|
|
177
|
+
/* 使用 CARDS 关键字(等同于 DATALINES) */
|
|
178
|
+
DATA t;
|
|
179
|
+
INPUT x y;
|
|
180
|
+
CARDS;
|
|
181
|
+
10 20
|
|
182
|
+
30 40
|
|
183
|
+
;
|
|
184
|
+
RUN;
|
|
185
|
+
|
|
186
|
+
/* $ 位于 INPUT 语句末尾(同样作用于它前面的变量,这里是 name) */
|
|
187
|
+
DATA t;
|
|
188
|
+
INPUT id name $;
|
|
189
|
+
DATALINES;
|
|
190
|
+
1 Alice
|
|
191
|
+
2 Bob
|
|
192
|
+
;
|
|
193
|
+
RUN;
|
|
194
|
+
|
|
195
|
+
/* 同一数据行内连续写多条观测也支持 */
|
|
196
|
+
DATA t;
|
|
197
|
+
INPUT x y;
|
|
198
|
+
DATALINES;
|
|
199
|
+
1 2 3 4
|
|
200
|
+
;
|
|
201
|
+
RUN;
|
|
202
|
+
/* 结果:x=1,y=2 和 x=3,y=4 两行 */
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
支持的关键字:`DATALINES`、`CARDS`、`LINES4`
|
|
206
|
+
|
|
207
|
+
限制:
|
|
208
|
+
- 仅支持列表模式(空格分隔),不支持列模式(`INPUT x 1-10`)或格式化模式(`INPUT x 8.2`)
|
|
209
|
+
- 每个值必须是单个 token(不含空格)
|
|
210
|
+
|
|
211
|
+
#### SET — 读取数据集
|
|
212
|
+
|
|
213
|
+
```sas
|
|
214
|
+
DATA new;
|
|
215
|
+
SET old;
|
|
216
|
+
RUN;
|
|
217
|
+
|
|
218
|
+
/* 读取多个数据集(纵向拼接) */
|
|
219
|
+
DATA combined;
|
|
220
|
+
SET ds1 ds2 ds3;
|
|
221
|
+
RUN;
|
|
222
|
+
|
|
223
|
+
/* 带数据集选项 */
|
|
224
|
+
DATA new;
|
|
225
|
+
SET old(KEEP=name salary);
|
|
226
|
+
RUN;
|
|
227
|
+
|
|
228
|
+
DATA new;
|
|
229
|
+
SET old(DROP=id);
|
|
230
|
+
RUN;
|
|
231
|
+
|
|
232
|
+
DATA new;
|
|
233
|
+
SET old(WHERE=(salary > 50000));
|
|
234
|
+
RUN;
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
#### MERGE — 合并数据集
|
|
238
|
+
|
|
239
|
+
```sas
|
|
240
|
+
/* 一对一合并(按位置) */
|
|
241
|
+
DATA combined;
|
|
242
|
+
MERGE names salaries;
|
|
243
|
+
RUN;
|
|
244
|
+
|
|
245
|
+
/* 按 BY 变量匹配合并 */
|
|
246
|
+
DATA merged;
|
|
247
|
+
MERGE names salaries;
|
|
248
|
+
BY id;
|
|
249
|
+
RUN;
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
#### 赋值语句
|
|
253
|
+
|
|
254
|
+
```sas
|
|
255
|
+
DATA new;
|
|
256
|
+
SET old;
|
|
257
|
+
double_sal = salary * 2;
|
|
258
|
+
name_upper = UPCASE(name);
|
|
259
|
+
label = CATS(name, ' - ', dept);
|
|
260
|
+
RUN;
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
#### IF / THEN / ELSE
|
|
264
|
+
|
|
265
|
+
```sas
|
|
266
|
+
/* 单语句形式 */
|
|
267
|
+
DATA new;
|
|
268
|
+
SET old;
|
|
269
|
+
IF salary > 60000 THEN level = 'Senior';
|
|
270
|
+
ELSE level = 'Junior';
|
|
271
|
+
RUN;
|
|
272
|
+
|
|
273
|
+
/* 块形式 */
|
|
274
|
+
DATA new;
|
|
275
|
+
SET old;
|
|
276
|
+
IF salary > 60000 THEN DO;
|
|
277
|
+
level = 'Senior';
|
|
278
|
+
bonus = salary * 0.1;
|
|
279
|
+
END;
|
|
280
|
+
ELSE DO;
|
|
281
|
+
level = 'Junior';
|
|
282
|
+
bonus = salary * 0.05;
|
|
283
|
+
END;
|
|
284
|
+
RUN;
|
|
285
|
+
|
|
286
|
+
/* 子集化 IF(无 THEN)— 仅保留满足条件的行 */
|
|
287
|
+
DATA high_paid;
|
|
288
|
+
SET old;
|
|
289
|
+
IF salary >= 60000;
|
|
290
|
+
RUN;
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
#### DO 循环
|
|
294
|
+
|
|
295
|
+
```sas
|
|
296
|
+
/* 迭代 DO */
|
|
297
|
+
DATA loop;
|
|
298
|
+
DO i = 1 TO 10;
|
|
299
|
+
x = i ** 2;
|
|
300
|
+
OUTPUT;
|
|
301
|
+
END;
|
|
302
|
+
RUN;
|
|
303
|
+
|
|
304
|
+
/* 带 BY 步长 */
|
|
305
|
+
DATA step;
|
|
306
|
+
DO i = 0 TO 100 BY 10;
|
|
307
|
+
OUTPUT;
|
|
308
|
+
END;
|
|
309
|
+
RUN;
|
|
310
|
+
|
|
311
|
+
/* DO WHILE */
|
|
312
|
+
DATA w;
|
|
313
|
+
i = 1;
|
|
314
|
+
DO WHILE (i <= 5);
|
|
315
|
+
x = i;
|
|
316
|
+
OUTPUT;
|
|
317
|
+
i + 1;
|
|
318
|
+
END;
|
|
319
|
+
RUN;
|
|
320
|
+
|
|
321
|
+
/* DO UNTIL */
|
|
322
|
+
DATA u;
|
|
323
|
+
i = 1;
|
|
324
|
+
DO UNTIL (i > 5);
|
|
325
|
+
x = i;
|
|
326
|
+
OUTPUT;
|
|
327
|
+
i + 1;
|
|
328
|
+
END;
|
|
329
|
+
RUN;
|
|
330
|
+
|
|
331
|
+
/* 简单 DO 块 */
|
|
332
|
+
DATA s;
|
|
333
|
+
DO;
|
|
334
|
+
x = 1;
|
|
335
|
+
y = 2;
|
|
336
|
+
END;
|
|
337
|
+
RUN;
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
#### OUTPUT
|
|
341
|
+
|
|
342
|
+
```sas
|
|
343
|
+
/* 显式 OUTPUT — 手动控制输出时机 */
|
|
344
|
+
DATA new;
|
|
345
|
+
SET old;
|
|
346
|
+
OUTPUT; /* 每行输出 */
|
|
347
|
+
RUN;
|
|
348
|
+
|
|
349
|
+
/* 条件输出 */
|
|
350
|
+
DATA new;
|
|
351
|
+
SET old;
|
|
352
|
+
IF salary > 50000 THEN OUTPUT;
|
|
353
|
+
RUN;
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
> **注意**:如果存在显式 OUTPUT 语句,则关闭隐式输出。这是标准 SAS 行为。
|
|
357
|
+
|
|
358
|
+
#### DELETE / STOP
|
|
359
|
+
|
|
360
|
+
```sas
|
|
361
|
+
/* DELETE — 删除当前观测,不输出 */
|
|
362
|
+
DATA new;
|
|
363
|
+
SET old;
|
|
364
|
+
IF salary < 30000 THEN DELETE;
|
|
365
|
+
RUN;
|
|
366
|
+
|
|
367
|
+
/* STOP — 立即停止 DATA 步 */
|
|
368
|
+
DATA new;
|
|
369
|
+
SET old;
|
|
370
|
+
IF _N_ > 100 THEN STOP;
|
|
371
|
+
RUN;
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
#### RETAIN
|
|
375
|
+
|
|
376
|
+
```sas
|
|
377
|
+
/* RETAIN — 在迭代间保留变量值 */
|
|
378
|
+
DATA cumulative;
|
|
379
|
+
SET sales;
|
|
380
|
+
RETAIN running_total 0; /* 初始值为 0 */
|
|
381
|
+
running_total = running_total + amount;
|
|
382
|
+
RUN;
|
|
383
|
+
|
|
384
|
+
/* 不带初始值(默认为缺失) */
|
|
385
|
+
DATA lagged;
|
|
386
|
+
SET series;
|
|
387
|
+
RETAIN prev_value;
|
|
388
|
+
change = value - prev_value;
|
|
389
|
+
prev_value = value;
|
|
390
|
+
RUN;
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
#### WHERE
|
|
394
|
+
|
|
395
|
+
```sas
|
|
396
|
+
/* WHERE — 在迭代前过滤行 */
|
|
397
|
+
DATA filtered;
|
|
398
|
+
SET all_data;
|
|
399
|
+
WHERE salary >= 50000;
|
|
400
|
+
RUN;
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
#### LAG / DIF — 滞后与差分函数
|
|
404
|
+
|
|
405
|
+
```sas
|
|
406
|
+
/* LAG — 返回前第 n 个观测的值(LAG 即 n=1) */
|
|
407
|
+
DATA lagged;
|
|
408
|
+
SET series;
|
|
409
|
+
prev = LAG(value); /* 前 1 个值 */
|
|
410
|
+
prev2 = LAG2(value); /* 前 2 个值 */
|
|
411
|
+
RUN;
|
|
412
|
+
|
|
413
|
+
/* DIF — 当前值与前第 n 个观测的值之差(DIF 即 n=1) */
|
|
414
|
+
DATA diffs;
|
|
415
|
+
SET series;
|
|
416
|
+
change = DIF(value); /* value - LAG(value) */
|
|
417
|
+
change2 = DIF2(value); /* value - LAG2(value) */
|
|
418
|
+
RUN;
|
|
419
|
+
|
|
420
|
+
/* 配合 RETAIN 做累计计算 */
|
|
421
|
+
DATA cumulative;
|
|
422
|
+
SET sales;
|
|
423
|
+
RETAIN running_total 0;
|
|
424
|
+
running_total = running_total + amount;
|
|
425
|
+
lag_amount = LAG(amount);
|
|
426
|
+
RUN;
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
支持的函数:`LAG` / `LAG2` ~ `LAG9`、`DIF` / `DIF2` ~ `DIF9`
|
|
430
|
+
|
|
431
|
+
#### _N_ — 自动变量
|
|
432
|
+
|
|
433
|
+
`_N_` 是 DATA step 自动变量,表示当前迭代次数(从 1 开始)。
|
|
434
|
+
|
|
435
|
+
```sas
|
|
436
|
+
/* 仅处理前 100 行 */
|
|
437
|
+
DATA subset;
|
|
438
|
+
SET big_data;
|
|
439
|
+
IF _N_ > 100 THEN STOP;
|
|
440
|
+
RUN;
|
|
441
|
+
|
|
442
|
+
/* 添加行号 */
|
|
443
|
+
DATA numbered;
|
|
444
|
+
SET raw;
|
|
445
|
+
row_id = _N_;
|
|
446
|
+
RUN;
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
#### KEEP / DROP
|
|
450
|
+
|
|
451
|
+
```sas
|
|
452
|
+
DATA new;
|
|
453
|
+
SET old;
|
|
454
|
+
KEEP name salary;
|
|
455
|
+
RUN;
|
|
456
|
+
|
|
457
|
+
DATA new;
|
|
458
|
+
SET old;
|
|
459
|
+
DROP temp_var scratch;
|
|
460
|
+
RUN;
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
#### RENAME
|
|
464
|
+
|
|
465
|
+
```sas
|
|
466
|
+
DATA new;
|
|
467
|
+
SET old;
|
|
468
|
+
RENAME salary = annual_salary name = employee_name;
|
|
469
|
+
RUN;
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
#### FORMAT
|
|
473
|
+
|
|
474
|
+
```sas
|
|
475
|
+
DATA new;
|
|
476
|
+
SET old;
|
|
477
|
+
FORMAT salary DOLLAR10. date DATE9. ratio 8.2;
|
|
478
|
+
RUN;
|
|
479
|
+
```
|
|
480
|
+
|
|
481
|
+
支持的格式说明符写法:`FORMATNAMEw.d`(如 `DOLLAR10.`、`COMMA12.2`)
|
|
482
|
+
|
|
483
|
+
#### LABEL
|
|
484
|
+
|
|
485
|
+
```sas
|
|
486
|
+
DATA new;
|
|
487
|
+
SET old;
|
|
488
|
+
LABEL name = 'Employee Name'
|
|
489
|
+
salary = 'Annual Salary';
|
|
490
|
+
RUN;
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
#### ARRAY
|
|
494
|
+
|
|
495
|
+
```sas
|
|
496
|
+
/* 定义数组引用已有变量 */
|
|
497
|
+
DATA new;
|
|
498
|
+
ARRAY scores[3] math science english;
|
|
499
|
+
SET students;
|
|
500
|
+
avg = MEAN(of scores[*]);
|
|
501
|
+
RUN;
|
|
502
|
+
|
|
503
|
+
/* arr[i] 下标访问(SAS 1-based indexing) */
|
|
504
|
+
DATA new;
|
|
505
|
+
ARRAY vals[3] a b c;
|
|
506
|
+
DO i = 1 TO 3;
|
|
507
|
+
vals[i] = i * 10; /* a=10; b=20; c=30 */
|
|
508
|
+
END;
|
|
509
|
+
RUN;
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
#### FIRST. / LAST. — BY 组首尾标记
|
|
513
|
+
|
|
514
|
+
```sas
|
|
515
|
+
/* 配合 BY 语句,自动生成 FIRST.varname / LAST.varname 布尔标记 */
|
|
516
|
+
PROC SORT DATA=scores OUT=sorted;
|
|
517
|
+
BY group;
|
|
518
|
+
RUN;
|
|
519
|
+
|
|
520
|
+
DATA result;
|
|
521
|
+
SET sorted;
|
|
522
|
+
BY group;
|
|
523
|
+
is_first = FIRST.group; /* 该 BY 组第一条为 1(True) */
|
|
524
|
+
is_last = LAST.group; /* 该 BY 组最后一条为 1(True) */
|
|
525
|
+
RUN;
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
#### SUBSTR 赋值(左值)
|
|
529
|
+
|
|
530
|
+
```sas
|
|
531
|
+
/* SUBSTR(target, start, length) = value — 原地替换字符串片段 */
|
|
532
|
+
DATA result;
|
|
533
|
+
x = 'ABCDEFGH';
|
|
534
|
+
SUBSTR(x, 3, 2) = 'XY'; /* 结果: ABXYEFGH */
|
|
535
|
+
RUN;
|
|
536
|
+
```
|
|
537
|
+
|
|
538
|
+
#### PUT 语句
|
|
539
|
+
|
|
540
|
+
```sas
|
|
541
|
+
/* PUT — 输出到日志 */
|
|
542
|
+
DATA _NULL_;
|
|
543
|
+
x = 42;
|
|
544
|
+
PUT x; /* 输出变量值 */
|
|
545
|
+
PUT 'Hello World'; /* 输出字符串字面量 */
|
|
546
|
+
RUN;
|
|
547
|
+
```
|
|
548
|
+
|
|
549
|
+
#### CALL SYMPUT — 动态创建宏变量
|
|
550
|
+
|
|
551
|
+
```sas
|
|
552
|
+
/* CALL SYMPUT(name, value) — 在 DATA step 中设置宏变量
|
|
553
|
+
注意:宏变量值在 DATA step 执行完毕后才能用 & 引用 */
|
|
554
|
+
DATA _NULL_;
|
|
555
|
+
CALL SYMPUT('threshold', '55000');
|
|
556
|
+
RUN;
|
|
557
|
+
/* 后续代码可用 &threshold 引用 */
|
|
558
|
+
```
|
|
559
|
+
|
|
560
|
+
#### LENGTH / ATTRIB — 变量属性设置
|
|
561
|
+
|
|
562
|
+
```sas
|
|
563
|
+
/* LENGTH — 设置变量存储长度(字符型需 $ 前缀) */
|
|
564
|
+
DATA result;
|
|
565
|
+
LENGTH NAME $ 20 AGE 4;
|
|
566
|
+
NAME = 'Alice';
|
|
567
|
+
AGE = 30;
|
|
568
|
+
RUN;
|
|
569
|
+
|
|
570
|
+
/* ATTRIB — 一次性设置 FORMAT / LABEL / LENGTH 等多个属性 */
|
|
571
|
+
DATA result;
|
|
572
|
+
ATTRIB NAME FORMAT=$10. LABEL='Person Name' LENGTH=20;
|
|
573
|
+
NAME = 'Alice';
|
|
574
|
+
RUN;
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
#### UPDATE — DATA step 更新
|
|
578
|
+
|
|
579
|
+
```sas
|
|
580
|
+
/* UPDATE — 按 BY key 用 transaction 数据覆盖 master 对应值 */
|
|
581
|
+
DATA result;
|
|
582
|
+
UPDATE master transaction;
|
|
583
|
+
BY id;
|
|
584
|
+
RUN;
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
### 未实现的 DATA Step 功能
|
|
588
|
+
|
|
589
|
+
| 功能 | 状态 |
|
|
590
|
+
|------|------|
|
|
591
|
+
| INPUT 列模式(`x 1-10`)/ 格式化模式(`x 8.2`) | 未实现(仅列表模式) |
|
|
592
|
+
| `CALL` 子程序(除 SYMPUT 外) | 部分实现(SYMPUT 已实现) |
|
|
593
|
+
| 双 SET(并行读取) | 部分支持(多 SET 纵向拼接) |
|
|
594
|
+
| `INFORMAT` / `FORMAT` 持久化(仅元数据存储) | 格式不应用到数据 |
|
|
595
|
+
|
|
596
|
+
---
|
|
597
|
+
|
|
598
|
+
## 4. PROC SQL 语法
|
|
599
|
+
|
|
600
|
+
### 基本结构
|
|
601
|
+
|
|
602
|
+
```sas
|
|
603
|
+
PROC SQL;
|
|
604
|
+
/* SQL statements */
|
|
605
|
+
QUIT;
|
|
606
|
+
```
|
|
607
|
+
|
|
608
|
+
### SELECT
|
|
609
|
+
|
|
610
|
+
```sas
|
|
611
|
+
/* 基本查询 */
|
|
612
|
+
PROC SQL;
|
|
613
|
+
SELECT * FROM employees;
|
|
614
|
+
QUIT;
|
|
615
|
+
|
|
616
|
+
/* 选择特定列 */
|
|
617
|
+
PROC SQL;
|
|
618
|
+
SELECT name, salary FROM employees;
|
|
619
|
+
QUIT;
|
|
620
|
+
|
|
621
|
+
/* 带别名 */
|
|
622
|
+
PROC SQL;
|
|
623
|
+
SELECT name AS employee_name, salary * 1.1 AS new_salary
|
|
624
|
+
FROM employees;
|
|
625
|
+
QUIT;
|
|
626
|
+
|
|
627
|
+
/* DISTINCT */
|
|
628
|
+
PROC SQL;
|
|
629
|
+
SELECT DISTINCT dept_id FROM employees;
|
|
630
|
+
QUIT;
|
|
631
|
+
|
|
632
|
+
/* WHERE 过滤 */
|
|
633
|
+
PROC SQL;
|
|
634
|
+
SELECT name, salary
|
|
635
|
+
FROM employees
|
|
636
|
+
WHERE salary > 55000 AND dept_id = 10;
|
|
637
|
+
QUIT;
|
|
638
|
+
|
|
639
|
+
/* ORDER BY 排序 */
|
|
640
|
+
PROC SQL;
|
|
641
|
+
SELECT * FROM employees
|
|
642
|
+
ORDER BY salary DESC, name ASC;
|
|
643
|
+
QUIT;
|
|
644
|
+
|
|
645
|
+
/* GROUP BY + 聚合 */
|
|
646
|
+
PROC SQL;
|
|
647
|
+
SELECT dept_id, COUNT(id) AS cnt, AVG(salary) AS avg_sal
|
|
648
|
+
FROM employees
|
|
649
|
+
GROUP BY dept_id;
|
|
650
|
+
QUIT;
|
|
651
|
+
|
|
652
|
+
/* HAVING 过滤分组 */
|
|
653
|
+
PROC SQL;
|
|
654
|
+
SELECT dept_id, COUNT(id) AS cnt
|
|
655
|
+
FROM employees
|
|
656
|
+
GROUP BY dept_id
|
|
657
|
+
HAVING COUNT(id) >= 2;
|
|
658
|
+
QUIT;
|
|
659
|
+
```
|
|
660
|
+
|
|
661
|
+
### JOIN
|
|
662
|
+
|
|
663
|
+
```sas
|
|
664
|
+
/* INNER JOIN */
|
|
665
|
+
PROC SQL;
|
|
666
|
+
SELECT e.name, d.dept_name
|
|
667
|
+
FROM employees e
|
|
668
|
+
INNER JOIN departments d ON e.dept_id = d.dept_id;
|
|
669
|
+
QUIT;
|
|
670
|
+
|
|
671
|
+
/* LEFT / RIGHT / FULL / CROSS JOIN */
|
|
672
|
+
PROC SQL;
|
|
673
|
+
SELECT e.name, d.dept_name
|
|
674
|
+
FROM employees e
|
|
675
|
+
LEFT JOIN departments d ON e.dept_id = d.dept_id;
|
|
676
|
+
QUIT;
|
|
677
|
+
|
|
678
|
+
/* 表别名支持 AS 关键字或裸别名 */
|
|
679
|
+
FROM employees AS e /* 带 AS */
|
|
680
|
+
FROM employees e /* 裸别名 */
|
|
681
|
+
```
|
|
682
|
+
|
|
683
|
+
### 集合操作
|
|
684
|
+
|
|
685
|
+
```sas
|
|
686
|
+
/* UNION(去重) */
|
|
687
|
+
PROC SQL;
|
|
688
|
+
SELECT name FROM t1
|
|
689
|
+
UNION
|
|
690
|
+
SELECT name FROM t2;
|
|
691
|
+
QUIT;
|
|
692
|
+
|
|
693
|
+
/* UNION ALL(不去重) */
|
|
694
|
+
PROC SQL;
|
|
695
|
+
SELECT name FROM t1
|
|
696
|
+
UNION ALL
|
|
697
|
+
SELECT name FROM t2;
|
|
698
|
+
QUIT;
|
|
699
|
+
|
|
700
|
+
/* INTERSECT(交集) */
|
|
701
|
+
PROC SQL;
|
|
702
|
+
SELECT name FROM t1
|
|
703
|
+
INTERSECT
|
|
704
|
+
SELECT name FROM t2;
|
|
705
|
+
QUIT;
|
|
706
|
+
|
|
707
|
+
/* EXCEPT(差集) */
|
|
708
|
+
PROC SQL;
|
|
709
|
+
SELECT name FROM t1
|
|
710
|
+
EXCEPT
|
|
711
|
+
SELECT name FROM t2;
|
|
712
|
+
QUIT;
|
|
713
|
+
```
|
|
714
|
+
|
|
715
|
+
### CREATE TABLE
|
|
716
|
+
|
|
717
|
+
```sas
|
|
718
|
+
PROC SQL;
|
|
719
|
+
CREATE TABLE high_salary AS
|
|
720
|
+
SELECT * FROM employees
|
|
721
|
+
WHERE salary >= 60000;
|
|
722
|
+
QUIT;
|
|
723
|
+
```
|
|
724
|
+
|
|
725
|
+
### INSERT INTO
|
|
726
|
+
|
|
727
|
+
```sas
|
|
728
|
+
/* 插入值 */
|
|
729
|
+
PROC SQL;
|
|
730
|
+
INSERT INTO employees VALUES(6, 'Frank', 10, 52000);
|
|
731
|
+
QUIT;
|
|
732
|
+
|
|
733
|
+
/* 指定列名 */
|
|
734
|
+
PROC SQL;
|
|
735
|
+
INSERT INTO employees (id, name, dept_id, salary)
|
|
736
|
+
VALUES(7, 'Gina', 30, 48000);
|
|
737
|
+
QUIT;
|
|
738
|
+
|
|
739
|
+
/* 从 SELECT 插入 */
|
|
740
|
+
PROC SQL;
|
|
741
|
+
INSERT INTO archive
|
|
742
|
+
SELECT * FROM employees WHERE dept_id = 10;
|
|
743
|
+
QUIT;
|
|
744
|
+
```
|
|
745
|
+
|
|
746
|
+
### UPDATE
|
|
747
|
+
|
|
748
|
+
```sas
|
|
749
|
+
PROC SQL;
|
|
750
|
+
UPDATE employees
|
|
751
|
+
SET salary = salary * 1.1
|
|
752
|
+
WHERE dept_id = 10;
|
|
753
|
+
QUIT;
|
|
754
|
+
|
|
755
|
+
/* 多列更新 */
|
|
756
|
+
PROC SQL;
|
|
757
|
+
UPDATE employees
|
|
758
|
+
SET salary = 99999, name = 'Updated'
|
|
759
|
+
WHERE id = 1;
|
|
760
|
+
QUIT;
|
|
761
|
+
```
|
|
762
|
+
|
|
763
|
+
### DELETE FROM
|
|
764
|
+
|
|
765
|
+
```sas
|
|
766
|
+
PROC SQL;
|
|
767
|
+
DELETE FROM employees
|
|
768
|
+
WHERE salary < 30000;
|
|
769
|
+
QUIT;
|
|
770
|
+
```
|
|
771
|
+
|
|
772
|
+
### 聚合函数
|
|
773
|
+
|
|
774
|
+
在 SELECT + GROUP BY 中支持的聚合函数:
|
|
775
|
+
|
|
776
|
+
`COUNT`, `SUM`, `MEAN`, `AVG`, `MIN`, `MAX`, `STD`, `MEDIAN`, `N`
|
|
777
|
+
|
|
778
|
+
```sas
|
|
779
|
+
PROC SQL;
|
|
780
|
+
SELECT dept_id,
|
|
781
|
+
COUNT(*) AS n,
|
|
782
|
+
SUM(salary) AS total,
|
|
783
|
+
MEAN(salary) AS avg_sal
|
|
784
|
+
FROM employees
|
|
785
|
+
GROUP BY dept_id;
|
|
786
|
+
QUIT;
|
|
787
|
+
```
|
|
788
|
+
|
|
789
|
+
### CASE WHEN 表达式
|
|
790
|
+
|
|
791
|
+
```sas
|
|
792
|
+
PROC SQL;
|
|
793
|
+
SELECT name, salary,
|
|
794
|
+
CASE
|
|
795
|
+
WHEN salary >= 70000 THEN 'High'
|
|
796
|
+
WHEN salary >= 50000 THEN 'Medium'
|
|
797
|
+
ELSE 'Low'
|
|
798
|
+
END AS salary_level
|
|
799
|
+
FROM employees;
|
|
800
|
+
QUIT;
|
|
801
|
+
|
|
802
|
+
/* 简单 CASE 形式(CASE 表达式 WHEN 值),同样可带 ELSE */
|
|
803
|
+
PROC SQL;
|
|
804
|
+
SELECT name, dept,
|
|
805
|
+
CASE dept
|
|
806
|
+
WHEN 'IT' THEN 'Technology'
|
|
807
|
+
WHEN 'HR' THEN 'Human Resources'
|
|
808
|
+
ELSE 'Other'
|
|
809
|
+
END AS dept_full
|
|
810
|
+
FROM employees;
|
|
811
|
+
QUIT;
|
|
812
|
+
```
|
|
813
|
+
|
|
814
|
+
### LIKE / BETWEEN / IS NULL
|
|
815
|
+
|
|
816
|
+
```sas
|
|
817
|
+
/* LIKE — 模式匹配 */
|
|
818
|
+
PROC SQL;
|
|
819
|
+
SELECT * FROM employees
|
|
820
|
+
WHERE name LIKE 'A%'; /* 以 A 开头 */
|
|
821
|
+
QUIT;
|
|
822
|
+
|
|
823
|
+
PROC SQL;
|
|
824
|
+
SELECT * FROM employees
|
|
825
|
+
WHERE name NOT LIKE '%e'; /* 不以 e 结尾 */
|
|
826
|
+
QUIT;
|
|
827
|
+
|
|
828
|
+
/* 通配符:% 匹配任意字符序列,_ 匹配单个字符 */
|
|
829
|
+
PROC SQL;
|
|
830
|
+
SELECT * FROM employees
|
|
831
|
+
WHERE name LIKE '_o%'; /* 第二个字符是 o */
|
|
832
|
+
QUIT;
|
|
833
|
+
|
|
834
|
+
/* BETWEEN — 范围查询 */
|
|
835
|
+
PROC SQL;
|
|
836
|
+
SELECT * FROM employees
|
|
837
|
+
WHERE salary BETWEEN 50000 AND 70000;
|
|
838
|
+
QUIT;
|
|
839
|
+
|
|
840
|
+
PROC SQL;
|
|
841
|
+
SELECT * FROM employees
|
|
842
|
+
WHERE salary NOT BETWEEN 50000 AND 70000;
|
|
843
|
+
QUIT;
|
|
844
|
+
|
|
845
|
+
/* IS NULL / IS NOT NULL — 空值判断 */
|
|
846
|
+
PROC SQL;
|
|
847
|
+
SELECT * FROM employees
|
|
848
|
+
WHERE manager IS NULL;
|
|
849
|
+
QUIT;
|
|
850
|
+
|
|
851
|
+
PROC SQL;
|
|
852
|
+
SELECT * FROM employees
|
|
853
|
+
WHERE email IS NOT NULL;
|
|
854
|
+
QUIT;
|
|
855
|
+
```
|
|
856
|
+
|
|
857
|
+
### EXISTS / NOT EXISTS — 子查询存在性检查
|
|
858
|
+
|
|
859
|
+
```sas
|
|
860
|
+
/* EXISTS — 检查子查询是否有结果 */
|
|
861
|
+
PROC SQL;
|
|
862
|
+
SELECT * FROM employees e
|
|
863
|
+
WHERE EXISTS (SELECT * FROM departments d WHERE d.dept_id = e.dept_id);
|
|
864
|
+
QUIT;
|
|
865
|
+
|
|
866
|
+
/* NOT EXISTS */
|
|
867
|
+
PROC SQL;
|
|
868
|
+
SELECT * FROM employees e
|
|
869
|
+
WHERE NOT EXISTS (SELECT * FROM departments d WHERE d.dept_id = e.dept_id);
|
|
870
|
+
QUIT;
|
|
871
|
+
```
|
|
872
|
+
|
|
873
|
+
### 未实现的 SQL 功能
|
|
874
|
+
|
|
875
|
+
| 功能 | 状态 |
|
|
876
|
+
|------|------|
|
|
877
|
+
| 子查询(FROM 中的子查询) | 语法支持,执行未测试 |
|
|
878
|
+
| 窗口函数(ROW_NUMBER, RANK 等) | 未实现 |
|
|
879
|
+
| 多表 UPDATE(FROM 子句) | 未实现 |
|
|
880
|
+
| ALTER TABLE | 未实现 |
|
|
881
|
+
| CREATE VIEW | 未实现 |
|
|
882
|
+
| CREATE INDEX | 未实现 |
|
|
883
|
+
|
|
884
|
+
---
|
|
885
|
+
|
|
886
|
+
## 5. LIBNAME — 库引用
|
|
887
|
+
|
|
888
|
+
```sas
|
|
889
|
+
/* 分配库路径 */
|
|
890
|
+
LIBNAME mylib "C:/data";
|
|
891
|
+
|
|
892
|
+
/* 使用库引用访问数据集 */
|
|
893
|
+
PROC PRINT DATA=mylib.employees; RUN;
|
|
894
|
+
|
|
895
|
+
PROC SQL;
|
|
896
|
+
CREATE TABLE mylib.output AS
|
|
897
|
+
SELECT * FROM employees
|
|
898
|
+
WHERE salary > 50000;
|
|
899
|
+
QUIT;
|
|
900
|
+
|
|
901
|
+
/* 清除库引用 */
|
|
902
|
+
LIBNAME mylib;
|
|
903
|
+
```
|
|
904
|
+
|
|
905
|
+
- `WORK` 是默认临时库,无需手动分配
|
|
906
|
+
- 库路径可以是绝对路径或相对路径
|
|
907
|
+
- LIBNAME 后指定的目录会自动创建(如不存在)
|
|
908
|
+
|
|
909
|
+
### FILENAME — 文件引用
|
|
910
|
+
|
|
911
|
+
```sas
|
|
912
|
+
/* 分配文件引用名 */
|
|
913
|
+
FILENAME myfile "C:/data/output.csv";
|
|
914
|
+
|
|
915
|
+
/* 文件引用名存入宏变量 _FILEREF_{name},可在后续代码中引用 */
|
|
916
|
+
/* 注意:FILENAME 的实际文件操作(读写)尚未完全实现 */
|
|
917
|
+
```
|
|
918
|
+
|
|
919
|
+
---
|
|
920
|
+
|
|
921
|
+
## 6. PROC 过程步
|
|
922
|
+
|
|
923
|
+
### PROC PRINT
|
|
924
|
+
|
|
925
|
+
```sas
|
|
926
|
+
/* 基本用法 */
|
|
927
|
+
PROC PRINT DATA=employees; RUN;
|
|
928
|
+
|
|
929
|
+
/* 指定变量 */
|
|
930
|
+
PROC PRINT DATA=employees;
|
|
931
|
+
VAR name salary dept_id;
|
|
932
|
+
RUN;
|
|
933
|
+
|
|
934
|
+
/* 带 ID 变量 */
|
|
935
|
+
PROC PRINT DATA=employees;
|
|
936
|
+
VAR name salary;
|
|
937
|
+
ID id;
|
|
938
|
+
RUN;
|
|
939
|
+
|
|
940
|
+
/* BY 分组显示(需先排序) */
|
|
941
|
+
PROC SORT DATA=employees OUT=sorted;
|
|
942
|
+
BY dept;
|
|
943
|
+
RUN;
|
|
944
|
+
PROC PRINT DATA=sorted;
|
|
945
|
+
VAR name salary;
|
|
946
|
+
BY dept;
|
|
947
|
+
RUN;
|
|
948
|
+
|
|
949
|
+
/* SUM 汇总(按 BY 组求和) */
|
|
950
|
+
PROC PRINT DATA=sorted;
|
|
951
|
+
VAR name salary;
|
|
952
|
+
BY dept;
|
|
953
|
+
SUM salary;
|
|
954
|
+
RUN;
|
|
955
|
+
```
|
|
956
|
+
|
|
957
|
+
### PROC SORT
|
|
958
|
+
|
|
959
|
+
```sas
|
|
960
|
+
/* 基本排序 */
|
|
961
|
+
PROC SORT DATA=employees OUT=sorted;
|
|
962
|
+
BY salary;
|
|
963
|
+
RUN;
|
|
964
|
+
|
|
965
|
+
/* 降序排序 */
|
|
966
|
+
PROC SORT DATA=employees OUT=sorted;
|
|
967
|
+
BY descending salary;
|
|
968
|
+
RUN;
|
|
969
|
+
|
|
970
|
+
/* 多字段排序 */
|
|
971
|
+
PROC SORT DATA=employees OUT=sorted;
|
|
972
|
+
BY dept_id descending salary;
|
|
973
|
+
RUN;
|
|
974
|
+
|
|
975
|
+
/* 去重 — 按 BY 变量保留第一条 */
|
|
976
|
+
PROC SORT DATA=employees OUT=unique_dept NODUPKEY;
|
|
977
|
+
BY dept_id;
|
|
978
|
+
RUN;
|
|
979
|
+
|
|
980
|
+
/* 去重 — 删除完全重复的行 */
|
|
981
|
+
PROC SORT DATA=employees OUT=no_dups NODUPRECS;
|
|
982
|
+
BY id;
|
|
983
|
+
RUN;
|
|
984
|
+
|
|
985
|
+
/* 就地排序(覆盖原数据集) */
|
|
986
|
+
PROC SORT DATA=employees;
|
|
987
|
+
BY salary;
|
|
988
|
+
RUN;
|
|
989
|
+
```
|
|
990
|
+
|
|
991
|
+
### PROC MEANS / SUMMARY
|
|
992
|
+
|
|
993
|
+
```sas
|
|
994
|
+
/* 显示 VAR 指定变量的描述性统计 */
|
|
995
|
+
PROC MEANS DATA=employees;
|
|
996
|
+
VAR salary;
|
|
997
|
+
RUN;
|
|
998
|
+
|
|
999
|
+
/* PROC SUMMARY 与 PROC MEANS 等价 */
|
|
1000
|
+
PROC SUMMARY DATA=employees;
|
|
1001
|
+
VAR salary;
|
|
1002
|
+
RUN;
|
|
1003
|
+
|
|
1004
|
+
/* 指定统计量(N MEAN SUM MIN MAX STD MEDIAN) */
|
|
1005
|
+
PROC MEANS DATA=employees N MEAN SUM;
|
|
1006
|
+
VAR salary;
|
|
1007
|
+
RUN;
|
|
1008
|
+
|
|
1009
|
+
/* CLASS 分组统计 */
|
|
1010
|
+
PROC MEANS DATA=employees;
|
|
1011
|
+
VAR salary;
|
|
1012
|
+
CLASS dept;
|
|
1013
|
+
RUN;
|
|
1014
|
+
|
|
1015
|
+
/* BY 分组(需先排序) */
|
|
1016
|
+
PROC SORT DATA=employees OUT=sorted;
|
|
1017
|
+
BY dept;
|
|
1018
|
+
RUN;
|
|
1019
|
+
PROC MEANS DATA=sorted;
|
|
1020
|
+
VAR salary;
|
|
1021
|
+
BY dept;
|
|
1022
|
+
RUN;
|
|
1023
|
+
|
|
1024
|
+
/* OUT= 输出结果到数据集 */
|
|
1025
|
+
PROC MEANS DATA=employees OUT=summary;
|
|
1026
|
+
VAR salary;
|
|
1027
|
+
CLASS dept;
|
|
1028
|
+
RUN;
|
|
1029
|
+
PROC PRINT DATA=summary; RUN;
|
|
1030
|
+
|
|
1031
|
+
/* MAXDEC= 控制小数位数 */
|
|
1032
|
+
PROC MEANS DATA=employees MAXDEC=2 N MEAN STD;
|
|
1033
|
+
VAR salary;
|
|
1034
|
+
CLASS dept;
|
|
1035
|
+
RUN;
|
|
1036
|
+
```
|
|
1037
|
+
|
|
1038
|
+
默认输出:count, mean, std, min, 25%, 50%, 75%, max。
|
|
1039
|
+
指定统计量时仅输出所选统计。
|
|
1040
|
+
|
|
1041
|
+
### PROC FREQ
|
|
1042
|
+
|
|
1043
|
+
```sas
|
|
1044
|
+
/* 单变量频率表 */
|
|
1045
|
+
PROC FREQ DATA=employees;
|
|
1046
|
+
TABLES dept_id;
|
|
1047
|
+
RUN;
|
|
1048
|
+
|
|
1049
|
+
/* 交叉表 */
|
|
1050
|
+
PROC FREQ DATA=employees;
|
|
1051
|
+
TABLES dept_id * gender;
|
|
1052
|
+
RUN;
|
|
1053
|
+
```
|
|
1054
|
+
|
|
1055
|
+
交叉表输出示例(以 `TABLES gender * status;` 为例,行为 gender,列为 status):
|
|
1056
|
+
```
|
|
1057
|
+
status N Y Total
|
|
1058
|
+
gender
|
|
1059
|
+
F 1 1 2
|
|
1060
|
+
M 1 2 3
|
|
1061
|
+
Total 2 3 5
|
|
1062
|
+
```
|
|
1063
|
+
|
|
1064
|
+
### PROC CONTENTS
|
|
1065
|
+
|
|
1066
|
+
```sas
|
|
1067
|
+
PROC CONTENTS DATA=employees; RUN;
|
|
1068
|
+
```
|
|
1069
|
+
|
|
1070
|
+
输出包括:引擎类型、观测数、变量数、每个变量的名称、类型(numeric/character)、长度。
|
|
1071
|
+
|
|
1072
|
+
### PROC IMPORT
|
|
1073
|
+
|
|
1074
|
+
```sas
|
|
1075
|
+
/* 导入 CSV */
|
|
1076
|
+
PROC IMPORT DATAFILE="path/to/file.csv"
|
|
1077
|
+
OUT=work.mydata
|
|
1078
|
+
DBMS=CSV;
|
|
1079
|
+
RUN;
|
|
1080
|
+
|
|
1081
|
+
/* 导入 TSV */
|
|
1082
|
+
PROC IMPORT DATAFILE="path/to/file.tsv"
|
|
1083
|
+
OUT=work.mydata
|
|
1084
|
+
DBMS=TAB;
|
|
1085
|
+
RUN;
|
|
1086
|
+
|
|
1087
|
+
/* 导入 Excel(需要 openpyxl) */
|
|
1088
|
+
PROC IMPORT DATAFILE="path/to/file.xlsx"
|
|
1089
|
+
OUT=work.mydata
|
|
1090
|
+
DBMS=XLSX;
|
|
1091
|
+
RUN;
|
|
1092
|
+
|
|
1093
|
+
/* 控制选项 */
|
|
1094
|
+
PROC IMPORT DATAFILE="data.csv"
|
|
1095
|
+
OUT=work.mydata
|
|
1096
|
+
DBMS=CSV
|
|
1097
|
+
GETNAMES=NO /* 第一行不作为列名 */
|
|
1098
|
+
DELIMITER="|"; /* 自定义分隔符 */
|
|
1099
|
+
RUN;
|
|
1100
|
+
```
|
|
1101
|
+
|
|
1102
|
+
**支持的 DBMS 类型**:`CSV`、`DLM`(分隔符)、`TAB`、`XLSX`/`EXCEL`
|
|
1103
|
+
|
|
1104
|
+
### PROC EXPORT
|
|
1105
|
+
|
|
1106
|
+
```sas
|
|
1107
|
+
/* 导出 CSV */
|
|
1108
|
+
PROC EXPORT DATA=employees
|
|
1109
|
+
OUTFILE="output.csv"
|
|
1110
|
+
DBMS=CSV;
|
|
1111
|
+
RUN;
|
|
1112
|
+
|
|
1113
|
+
/* 导出 TSV */
|
|
1114
|
+
PROC EXPORT DATA=employees
|
|
1115
|
+
OUTFILE="output.tsv"
|
|
1116
|
+
DBMS=TAB;
|
|
1117
|
+
RUN;
|
|
1118
|
+
|
|
1119
|
+
/* 自定义分隔符 */
|
|
1120
|
+
PROC EXPORT DATA=employees
|
|
1121
|
+
OUTFILE="output.txt"
|
|
1122
|
+
DBMS=DLM
|
|
1123
|
+
DELIMITER="|";
|
|
1124
|
+
RUN;
|
|
1125
|
+
```
|
|
1126
|
+
|
|
1127
|
+
### PROC APPEND
|
|
1128
|
+
|
|
1129
|
+
```sas
|
|
1130
|
+
/* PROC APPEND — 将 DATA= 数据集追加到 BASE= 数据集尾部 */
|
|
1131
|
+
DATA base;
|
|
1132
|
+
INPUT x y;
|
|
1133
|
+
DATALINES;
|
|
1134
|
+
1 2
|
|
1135
|
+
3 4
|
|
1136
|
+
;
|
|
1137
|
+
RUN;
|
|
1138
|
+
|
|
1139
|
+
DATA extra;
|
|
1140
|
+
INPUT x y;
|
|
1141
|
+
DATALINES;
|
|
1142
|
+
5 6
|
|
1143
|
+
;
|
|
1144
|
+
RUN;
|
|
1145
|
+
|
|
1146
|
+
PROC APPEND BASE=base DATA=extra;
|
|
1147
|
+
RUN;
|
|
1148
|
+
/* 结果:base 变为 3 行,包含所有记录 */
|
|
1149
|
+
```
|
|
1150
|
+
|
|
1151
|
+
### PROC DATASETS
|
|
1152
|
+
|
|
1153
|
+
```sas
|
|
1154
|
+
/* PROC DATASETS — 管理数据集(删除、重命名、查看) */
|
|
1155
|
+
PROC DATASETS;
|
|
1156
|
+
DELETE olddata1 olddata2; /* 删除指定数据集 */
|
|
1157
|
+
QUIT;
|
|
1158
|
+
|
|
1159
|
+
/* MODIFY + RENAME */
|
|
1160
|
+
PROC DATASETS;
|
|
1161
|
+
MODIFY mydata RENAME oldvar = newvar;
|
|
1162
|
+
QUIT;
|
|
1163
|
+
|
|
1164
|
+
/* 指定 LIBRARY */
|
|
1165
|
+
PROC DATASETS LIBRARY=mylib;
|
|
1166
|
+
CONTENTS DATA=specific_ds; /* 查看单个数据集详情 */
|
|
1167
|
+
QUIT;
|
|
1168
|
+
```
|
|
1169
|
+
|
|
1170
|
+
> **注意**:PROC DATASETS 必须以 `QUIT;` 结束。
|
|
1171
|
+
|
|
1172
|
+
### 未实现/部分实现的 PROC
|
|
1173
|
+
|
|
1174
|
+
| PROC | 状态 |
|
|
1175
|
+
|------|------|
|
|
1176
|
+
| PROC MEANS 的 NOPRINT | 已实现:抑制输出,OUT= 数据集仍正常写入 |
|
|
1177
|
+
| PROC TABULATE | 未实现 |
|
|
1178
|
+
| PROC TRANSPOSE | 未实现 |
|
|
1179
|
+
| PROC REPORT | 未实现 |
|
|
1180
|
+
| PROC GPLOT / GCHART | 未实现(图形) |
|
|
1181
|
+
| PROC REG / LOGISTIC | 未实现(统计建模) |
|
|
1182
|
+
| PROC SQL 的 QUIT 后代码 | 不支持(QUIT 必须是最后一条) |
|
|
1183
|
+
|
|
1184
|
+
---
|
|
1185
|
+
|
|
1186
|
+
## 7. 宏系统
|
|
1187
|
+
|
|
1188
|
+
### %LET — 宏变量赋值
|
|
1189
|
+
|
|
1190
|
+
```sas
|
|
1191
|
+
%LET threshold = 55000;
|
|
1192
|
+
%LET dataset = employees;
|
|
1193
|
+
%LET msg = Hello World;
|
|
1194
|
+
|
|
1195
|
+
/* 使用 & 引用 */
|
|
1196
|
+
DATA high;
|
|
1197
|
+
SET &dataset;
|
|
1198
|
+
IF salary > &threshold;
|
|
1199
|
+
RUN;
|
|
1200
|
+
```
|
|
1201
|
+
|
|
1202
|
+
### && — 间接引用
|
|
1203
|
+
|
|
1204
|
+
```sas
|
|
1205
|
+
%LET x = name;
|
|
1206
|
+
%LET col_&x = employee_name;
|
|
1207
|
+
|
|
1208
|
+
/* 第一遍:&& 解析为 &,&x 解析为 name,得到 &col_name;第二遍:&col_name 解析为 employee_name */
|
|
1209
|
+
%LET result = &&col_&x;
|
|
1210
|
+
```
|
|
1211
|
+
|
|
1212
|
+
### %MACRO / %MEND — 宏定义
|
|
1213
|
+
|
|
1214
|
+
```sas
|
|
1215
|
+
/* 定义无参宏 */
|
|
1216
|
+
%MACRO create_report;
|
|
1217
|
+
PROC PRINT DATA=employees; RUN;
|
|
1218
|
+
PROC MEANS DATA=employees; VAR salary; RUN;
|
|
1219
|
+
%MEND create_report;
|
|
1220
|
+
|
|
1221
|
+
/* 调用宏 */
|
|
1222
|
+
%create_report;
|
|
1223
|
+
```
|
|
1224
|
+
|
|
1225
|
+
```sas
|
|
1226
|
+
/* 定义带参宏 */
|
|
1227
|
+
%MACRO filter_data(dsname, col, threshold);
|
|
1228
|
+
DATA filtered;
|
|
1229
|
+
SET &dsname;
|
|
1230
|
+
IF &col >= &threshold;
|
|
1231
|
+
RUN;
|
|
1232
|
+
%MEND;
|
|
1233
|
+
|
|
1234
|
+
/* 调用带参宏 */
|
|
1235
|
+
%filter_data(work.employees, salary, 60000);
|
|
1236
|
+
```
|
|
1237
|
+
|
|
1238
|
+
### %IF / %THEN / %ELSE — 宏条件编译
|
|
1239
|
+
|
|
1240
|
+
```sas
|
|
1241
|
+
%LET env = PROD;
|
|
1242
|
+
|
|
1243
|
+
%IF &env = PROD %THEN %DO;
|
|
1244
|
+
DATA config;
|
|
1245
|
+
debug = 0;
|
|
1246
|
+
log_level = 'ERROR';
|
|
1247
|
+
RUN;
|
|
1248
|
+
%END;
|
|
1249
|
+
%ELSE %DO;
|
|
1250
|
+
DATA config;
|
|
1251
|
+
debug = 1;
|
|
1252
|
+
log_level = 'DEBUG';
|
|
1253
|
+
RUN;
|
|
1254
|
+
%END;
|
|
1255
|
+
```
|
|
1256
|
+
|
|
1257
|
+
支持的比较运算符:`=`、`NE`、`<>`、`>`、`>=`、`<`、`<=`、`GT`、`GE`、`LT`、`LE`
|
|
1258
|
+
|
|
1259
|
+
### %PUT — 宏打印
|
|
1260
|
+
|
|
1261
|
+
```sas
|
|
1262
|
+
%LET msg = Hello;
|
|
1263
|
+
%PUT &msg World; /* 输出到日志: Hello World */
|
|
1264
|
+
%PUT NOTE: Processing done.; /* 输出带前缀的信息 */
|
|
1265
|
+
```
|
|
1266
|
+
|
|
1267
|
+
### %DO %TO %BY — 宏循环
|
|
1268
|
+
|
|
1269
|
+
```sas
|
|
1270
|
+
/* 基本循环 */
|
|
1271
|
+
%DO i = 1 %TO 5;
|
|
1272
|
+
%PUT i = &i;
|
|
1273
|
+
%END;
|
|
1274
|
+
|
|
1275
|
+
/* 带 BY 步长 */
|
|
1276
|
+
%LET s = 0;
|
|
1277
|
+
%DO i = 1 %TO 10 %BY 2;
|
|
1278
|
+
%LET s = %EVAL(&s + &i);
|
|
1279
|
+
%END;
|
|
1280
|
+
```
|
|
1281
|
+
|
|
1282
|
+
### %EVAL / %SYSEVALF — 宏表达式求值
|
|
1283
|
+
|
|
1284
|
+
```sas
|
|
1285
|
+
/* %EVAL — 整数算术求值 */
|
|
1286
|
+
%LET x = %EVAL(3 + 4 * 2); /* x = 11 */
|
|
1287
|
+
|
|
1288
|
+
/* 支持 + - * / 和括号 */
|
|
1289
|
+
%LET result = %EVAL((100 - 20) / 4);
|
|
1290
|
+
```
|
|
1291
|
+
|
|
1292
|
+
### 注释
|
|
1293
|
+
|
|
1294
|
+
```sas
|
|
1295
|
+
/* 块注释 — 可跨行 */
|
|
1296
|
+
%LET x = 1; /* 行内注释 */
|
|
1297
|
+
|
|
1298
|
+
/* 块注释会被宏预处理器移除 */
|
|
1299
|
+
/* 嵌套注释支持有限 — 最外层 /* 内层 */ 会提前关闭 */
|
|
1300
|
+
```
|
|
1301
|
+
|
|
1302
|
+
### 未实现的宏功能
|
|
1303
|
+
|
|
1304
|
+
| 功能 | 状态 |
|
|
1305
|
+
|------|------|
|
|
1306
|
+
| %INCLUDE(文件包含) | 未实现 |
|
|
1307
|
+
| %GOTO / %LABEL | 未实现 |
|
|
1308
|
+
| %RETURN | 未实现 |
|
|
1309
|
+
| %SCAN 宏函数 | 未实现(有 DATA step SCAN) |
|
|
1310
|
+
| %SUBSTR 宏函数 | 未实现(有 DATA step SUBSTR) |
|
|
1311
|
+
| %SYSFUNC | 未实现 |
|
|
1312
|
+
| 宏嵌套调用 | 有限支持(不支持递归) |
|
|
1313
|
+
| 全局/本地宏变量作用域 | 有限实现 |
|
|
1314
|
+
| 自动宏变量(SYSDATE, SYSTIME, SYSERR 等) | 未实现 |
|
|
1315
|
+
|
|
1316
|
+
---
|
|
1317
|
+
|
|
1318
|
+
## 8. 内置函数参考
|
|
1319
|
+
|
|
1320
|
+
### 字符函数(22 个)
|
|
1321
|
+
|
|
1322
|
+
| 函数 | 语法 | 说明 |
|
|
1323
|
+
|------|------|------|
|
|
1324
|
+
| `SUBSTR` | `SUBSTR(string, start [, length])` | 提取子串(从 1 开始) |
|
|
1325
|
+
| `SCAN` | `SCAN(string, n [, delimiters])` | 提取第 n 个单词 |
|
|
1326
|
+
| `COMPRESS` | `COMPRESS(string [, chars [, modifiers]])` | 移除/保留指定字符。modifiers: K=保留, L=转小写, U=转大写 |
|
|
1327
|
+
| `UPCASE` | `UPCASE(string)` | 转大写 |
|
|
1328
|
+
| `LOWCASE` | `LOWCASE(string)` | 转小写 |
|
|
1329
|
+
| `STRIP` | `STRIP(string)` | 去除首尾空格 |
|
|
1330
|
+
| `TRIM` | `TRIM(string)` | 去除尾部空格 |
|
|
1331
|
+
| `LEFT` | `LEFT(string)` | 左对齐 |
|
|
1332
|
+
| `CAT` | `CAT(s1, s2, ...)` | 连接字符串 |
|
|
1333
|
+
| `CATS` | `CATS(s1, s2, ...)` | 连接字符串(自动 strip) |
|
|
1334
|
+
| `CATX` | `CATX(sep, s1, s2, ...)` | 用分隔符连接字符串 |
|
|
1335
|
+
| `COMPBL` | `COMPBL(string)` | 多个连续空格压缩为一个 |
|
|
1336
|
+
| `TRANWRD` | `TRANWRD(string, find, replace)` | 替换子串 |
|
|
1337
|
+
| `INDEX` | `INDEX(string, substring)` | 查找子串位置(未找到返回 0) |
|
|
1338
|
+
| `FIND` | `FIND(string, substring [, modifiers])` | 查找子串。modifiers: I=忽略大小写 |
|
|
1339
|
+
| `COUNT` | `COUNT(string, substring)` | 统计子串出现次数 |
|
|
1340
|
+
| `REPEAT` | `REPEAT(string, n)` | 重复字符串 n 次 |
|
|
1341
|
+
| `REVERSE` | `REVERSE(string)` | 反转字符串 |
|
|
1342
|
+
| `LENGTH` | `LENGTH(string)` | 字符串长度(去除尾部空格) |
|
|
1343
|
+
| `LENGTHC` | `LENGTHC(string)` | 字符串长度(含尾部空格) |
|
|
1344
|
+
| `MISSING` | `MISSING(var)` | 测试是否缺失(缺失返回 1,否则 0) |
|
|
1345
|
+
| `COALESCEC` | `COALESCEC(s1, s2, ...)` | 返回第一个非缺失字符串 |
|
|
1346
|
+
|
|
1347
|
+
### 数值函数(23 个)
|
|
1348
|
+
|
|
1349
|
+
| 函数 | 语法 | 说明 |
|
|
1350
|
+
|------|------|------|
|
|
1351
|
+
| `SUM` | `SUM(n1, n2, ...)` | 求和(忽略缺失值) |
|
|
1352
|
+
| `MEAN` | `MEAN(n1, n2, ...)` | 均值(忽略缺失值) |
|
|
1353
|
+
| `MIN` | `MIN(n1, n2, ...)` | 最小值 |
|
|
1354
|
+
| `MAX` | `MAX(n1, n2, ...)` | 最大值 |
|
|
1355
|
+
| `N` | `N(n1, n2, ...)` | 非缺失值个数 |
|
|
1356
|
+
| `NMISS` | `NMISS(n1, n2, ...)` | 缺失值个数 |
|
|
1357
|
+
| `ROUND` | `ROUND(number [, unit])` | 四舍五入到指定精度(默认 1) |
|
|
1358
|
+
| `INT` | `INT(number)` | 取整(截断小数) |
|
|
1359
|
+
| `MOD` | `MOD(dividend, divisor)` | 取模 |
|
|
1360
|
+
| `CEIL` | `CEIL(number)` | 向上取整 |
|
|
1361
|
+
| `FLOOR` | `FLOOR(number)` | 向下取整 |
|
|
1362
|
+
| `ABS` | `ABS(number)` | 绝对值 |
|
|
1363
|
+
| `SQRT` | `SQRT(number)` | 平方根 |
|
|
1364
|
+
| `LOG` | `LOG(number)` | 自然对数 |
|
|
1365
|
+
| `LOG10` | `LOG10(number)` | 以 10 为底的对数 |
|
|
1366
|
+
| `EXP` | `EXP(number)` | e 的 number 次方(自然指数函数) |
|
|
1367
|
+
| `SIN` | `SIN(number)` | 正弦 |
|
|
1368
|
+
| `COS` | `COS(number)` | 余弦 |
|
|
1369
|
+
| `TAN` | `TAN(number)` | 正切 |
|
|
1370
|
+
| `SIGN` | `SIGN(number)` | 符号(-1, 0, 1) |
|
|
1371
|
+
| `STD` | `STD(n1, n2, ...)` | 标准差(至少 2 个值) |
|
|
1372
|
+
| `RANGE` | `RANGE(n1, n2, ...)` | 极差(max - min) |
|
|
1373
|
+
| `MEDIAN` | `MEDIAN(n1, n2, ...)` | 中位数 |
|
|
1374
|
+
|
|
1375
|
+
### 日期函数(14 个)
|
|
1376
|
+
|
|
1377
|
+
| 函数 | 语法 | 说明 |
|
|
1378
|
+
|------|------|------|
|
|
1379
|
+
| `TODAY` | `TODAY()` | 当前日期(SAS 日期值,距 1960-01-01 的天数) |
|
|
1380
|
+
| `DATE` | `DATE()` | 同 TODAY |
|
|
1381
|
+
| `DATETIME` | `DATETIME()` | 当前日期时间(距 1960-01-01 的秒数) |
|
|
1382
|
+
| `MDY` | `MDY(month, day, year)` | 从月、日、年创建日期 |
|
|
1383
|
+
| `YEAR` | `YEAR(date)` | 提取年份 |
|
|
1384
|
+
| `MONTH` | `MONTH(date)` | 提取月份(1-12) |
|
|
1385
|
+
| `DAY` | `DAY(date)` | 提取日(1-31) |
|
|
1386
|
+
| `WEEKDAY` | `WEEKDAY(date)` | 星期几(1=周日 ... 7=周六) |
|
|
1387
|
+
| `QTR` | `QTR(date)` | 季度(1-4) |
|
|
1388
|
+
| `INTNX` | `INTNX(interval, start, increment [, align])` | 按间隔递增日期。interval: DAY/WEEK/MONTH/QTR/YEAR/HOUR/MINUTE/SECOND。align: B=起始, E=结束, M=中间 |
|
|
1389
|
+
| `INTCK` | `INTCK(interval, start, end)` | 计算两个日期间的间隔数 |
|
|
1390
|
+
| `DATEPART` | `DATEPART(datetime)` | 提取日期部分 |
|
|
1391
|
+
| `TIMEPART` | `TIMEPART(datetime)` | 提取时间部分 |
|
|
1392
|
+
| `DATEDIF` | `DATEDIF(start, end, unit)` | 计算两个日期之间的天/周/月/季/年差。unit: DAY/WEEK/MONTH/QTR/YEAR |
|
|
1393
|
+
|
|
1394
|
+
### 转换函数(2 个)
|
|
1395
|
+
|
|
1396
|
+
| 函数 | 语法 | 说明 |
|
|
1397
|
+
|------|------|------|
|
|
1398
|
+
| `INPUT` | `INPUT(source, informat)` | 将字符串转换为数值/日期。支持:BEST, F, COMMA, DOLLAR, DATE, DATE9, DDMMYY, MMDDYY, YYMMDD, MONYY, DATETIME |
|
|
1399
|
+
| `PUT` | `PUT(source, format)` | 将数值/日期转换为字符串。支持:width.dec, DATE9, MMDDYY10, YYMMDD10 |
|
|
1400
|
+
|
|
1401
|
+
### 条件函数(3 个)
|
|
1402
|
+
|
|
1403
|
+
| 函数 | 语法 | 说明 |
|
|
1404
|
+
|------|------|------|
|
|
1405
|
+
| `IFC` | `IFC(condition, true_val, false_val [, missing_val])` | 条件返回字符串 |
|
|
1406
|
+
| `IFN` | `IFN(condition, true_val, false_val [, missing_val])` | 条件返回数值 |
|
|
1407
|
+
| `COALESCE` | `COALESCE(n1, n2, ...)` | 返回第一个非缺失数值 |
|
|
1408
|
+
|
|
1409
|
+
### SQL 聚合函数
|
|
1410
|
+
|
|
1411
|
+
在 PROC SQL SELECT + GROUP BY 中可使用的聚合:
|
|
1412
|
+
|
|
1413
|
+
`COUNT`、`SUM`、`MEAN`/`AVG`、`MIN`、`MAX`、`STD`、`MEDIAN`、`N`
|
|
1414
|
+
|
|
1415
|
+
### 未实现的常用 SAS 函数
|
|
1416
|
+
|
|
1417
|
+
| 函数 | 说明 |
|
|
1418
|
+
|------|------|
|
|
1419
|
+
| `RANUNIF` / `RANNOR` | 随机数 |
|
|
1420
|
+
| `PROBNORM` / `PROBT` / `PROBF` | 概率分布 |
|
|
1421
|
+
| `PUTN` / `INPUTN` | 动态格式/输入 |
|
|
1422
|
+
| `STRIPE` | 去除所有空格 |
|
|
1423
|
+
| `TRANSLATE` | 字符替换(逐字符) |
|
|
1424
|
+
| `RANK` | 返回字符的 ASCII 码 |
|
|
1425
|
+
| `BYTE` | 返回 ASCII 码对应的字符 |
|
|
1426
|
+
| `VERIFY` | 返回第一个不在指定集合中的字符位置 |
|
|
1427
|
+
|
|
1428
|
+
---
|
|
1429
|
+
|
|
1430
|
+
## 9. 表达式与运算符
|
|
1431
|
+
|
|
1432
|
+
### 运算符优先级(从低到高)
|
|
1433
|
+
|
|
1434
|
+
| 优先级 | 运算符 | 说明 | 示例 |
|
|
1435
|
+
|--------|--------|------|------|
|
|
1436
|
+
| 1 | `OR` | 逻辑或 | `x = 1 OR y = 2` |
|
|
1437
|
+
| 2 | `AND` | 逻辑与 | `x = 1 AND y > 0` |
|
|
1438
|
+
| 3 | `NOT` | 逻辑非 | `NOT missing(x)` |
|
|
1439
|
+
| 4 | `=` `NE` `<>` `>` `>=` `<` `<=` `IN` | 比较 | `salary >= 50000` |
|
|
1440
|
+
| 5 | `\|\|` | 字符串连接 | `first \|\| ' ' \|\| last` |
|
|
1441
|
+
| 6 | `+` `-` | 加减 | `a + b - c` |
|
|
1442
|
+
| 7 | `*` `/` | 乘除 | `price * qty` |
|
|
1443
|
+
| 8 | 单目 `-` `+` | 正负号 | `-x` |
|
|
1444
|
+
|
|
1445
|
+
### 比较运算符
|
|
1446
|
+
|
|
1447
|
+
| 运算符 | 含义 | 说明 |
|
|
1448
|
+
|--------|------|------|
|
|
1449
|
+
| `=` | 等于 | 缺失值:`.` = `.` 为 True |
|
|
1450
|
+
| `NE` 或 `<>` | 不等于 | |
|
|
1451
|
+
| `>` | 大于 | |
|
|
1452
|
+
| `>=` | 大于等于 | |
|
|
1453
|
+
| `<` | 小于 | |
|
|
1454
|
+
| `<=` | 小于等于 | |
|
|
1455
|
+
| `IN` | 在列表中 | `dept_id IN (10, 20, 30)` |
|
|
1456
|
+
| `LIKE` | 模式匹配 | `%` 匹配任意字符序列,`_` 匹配单个字符 |
|
|
1457
|
+
| `NOT LIKE` | 模式不匹配 | |
|
|
1458
|
+
| `BETWEEN` | 范围判断 | `salary BETWEEN 50000 AND 70000` |
|
|
1459
|
+
| `NOT BETWEEN` | 范围外判断 | |
|
|
1460
|
+
| `IS NULL` | 是否为空 | `name IS NULL` |
|
|
1461
|
+
| `IS NOT NULL` | 是否非空 | `name IS NOT NULL` |
|
|
1462
|
+
|
|
1463
|
+
### CASE WHEN 表达式
|
|
1464
|
+
|
|
1465
|
+
```sas
|
|
1466
|
+
/* 搜索式 CASE */
|
|
1467
|
+
CASE WHEN cond1 THEN result1
|
|
1468
|
+
WHEN cond2 THEN result2
|
|
1469
|
+
ELSE default_result
|
|
1470
|
+
END
|
|
1471
|
+
|
|
1472
|
+
/* 简单式 CASE */
|
|
1473
|
+
CASE expr WHEN val1 THEN result1
|
|
1474
|
+
WHEN val2 THEN result2
|
|
1475
|
+
ELSE default_result
|
|
1476
|
+
END
|
|
1477
|
+
```
|
|
1478
|
+
|
|
1479
|
+
### 字面量
|
|
1480
|
+
|
|
1481
|
+
```sas
|
|
1482
|
+
42 /* 整数 */
|
|
1483
|
+
3.14 /* 小数 */
|
|
1484
|
+
'Hello' /* 字符串(单引号) */
|
|
1485
|
+
"World" /* 字符串(双引号) */
|
|
1486
|
+
NULL /* 空值 */
|
|
1487
|
+
```
|
|
1488
|
+
|
|
1489
|
+
### 函数调用
|
|
1490
|
+
|
|
1491
|
+
```sas
|
|
1492
|
+
UPCASE(name) /* 单参(无参函数同样带括号,如 TODAY()) */
|
|
1493
|
+
SUBSTR(name, 1, 3) /* 多参 */
|
|
1494
|
+
CATS(first, ' ', last) /* 可变参数;注意 CATS 会对每个参数自动 strip,需要空格分隔时请用 CATX(' ', first, last) */
|
|
1495
|
+
SUM(a, b, c, d) /* 可变参数 */
|
|
1496
|
+
```
|
|
1497
|
+
|
|
1498
|
+
---
|
|
1499
|
+
|
|
1500
|
+
## 10. 已知限制
|
|
1501
|
+
|
|
1502
|
+
1. **性能**:所有运算基于 pandas DataFrame,大数据集(>100 万行)可能较慢
|
|
1503
|
+
2. **内存**:所有数据集驻留在内存中,不支持磁盘缓存
|
|
1504
|
+
3. **并发**:不支持多线程或并行执行
|
|
1505
|
+
4. **精度**:浮点运算使用 IEEE 754 双精度,与 SAS 的精度有微小差异
|
|
1506
|
+
5. **缺失值**:数值缺失用 `NaN` 表示,字符串缺失用 `""` 表示,`''` 和 `""` 在 DATALINES 中均视为空值。SAS 的 `.`(数值型缺失)映射为 `NaN`。
|
|
1507
|
+
6. **日期**:日期基于 1960-01-01 的天数(与 SAS 一致),但时区处理不完整
|
|
1508
|
+
7. **FORMAT/LABEL**:仅存储为元数据,不改变数据的实际显示格式
|
|
1509
|
+
|
|
1510
|
+
---
|
|
1511
|
+
|
|
1512
|
+
## 11. 未实现的 SAS 功能
|
|
1513
|
+
|
|
1514
|
+
以下功能在 SAS 中存在但 SASLite 当前不支持:
|
|
1515
|
+
|
|
1516
|
+
### 数据步
|
|
1517
|
+
- 双 SET 并行读取
|
|
1518
|
+
- MODIFY 语句
|
|
1519
|
+
- INPUT 列模式 / 格式化模式
|
|
1520
|
+
- CALL 子程序(除 SYMPUT 外)
|
|
1521
|
+
|
|
1522
|
+
### 过程步
|
|
1523
|
+
- PROC TABULATE
|
|
1524
|
+
- PROC TRANSPOSE
|
|
1525
|
+
- PROC REPORT
|
|
1526
|
+
- PROC REG / LOGISTIC / GLM(统计建模)
|
|
1527
|
+
- PROC UNIVARIATE
|
|
1528
|
+
- PROC GPLOT / GCHART(图形)
|
|
1529
|
+
- PROC FORMAT(自定义格式)
|
|
1530
|
+
- PROC COMPARE
|
|
1531
|
+
- PROC COPY
|
|
1532
|
+
|
|
1533
|
+
### 宏
|
|
1534
|
+
- %INCLUDE
|
|
1535
|
+
- %GOTO / %LABEL
|
|
1536
|
+
- %RETURN
|
|
1537
|
+
- %SYSFUNC
|
|
1538
|
+
- %SCAN / %SUBSTR 宏函数
|
|
1539
|
+
- 宏递归
|
|
1540
|
+
- 自动宏变量(SYSDATE, SYSTIME, SYSERR 等)
|
|
1541
|
+
|
|
1542
|
+
### 全局
|
|
1543
|
+
- OPTIONS 设置(仅解析,不改变行为)
|
|
1544
|
+
- ODS 输出分发系统
|
|
1545
|
+
- 数据库连接(ODBC, JDBC)
|
|
1546
|
+
- PROC HTTP / JSON
|
|
1547
|
+
- 窗口函数(SQL)
|
|
1548
|
+
|
|
1549
|
+
---
|
|
1550
|
+
|
|
1551
|
+
> SASLite 的目标是在纯 Python 环境中提供足够完整的 SAS 语法子集,用于数据处理、清洗和转换任务。它不是 SAS 的完整替代品,而是一个兼容层,让熟悉 SAS 语法的用户能够快速上手 Python 数据处理。
|