openaivec 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openaivec/_serialize.py CHANGED
@@ -1,31 +1,10 @@
1
- """Serialization utilities for Pydantic BaseModel classes.
1
+ """Refactored serialization utilities for Pydantic BaseModel classes.
2
2
 
3
3
  This module provides utilities for converting Pydantic BaseModel classes
4
- to and from JSON schema representations. It supports dynamic model creation
5
- from JSON schemas with special handling for enum fields, which are converted
6
- to Literal types for better type safety and compatibility.
7
-
8
- Example:
9
- Basic serialization and deserialization:
10
-
11
- ```python
12
- from pydantic import BaseModel
13
- from typing import Literal
14
-
15
- class Status(BaseModel):
16
- value: Literal["active", "inactive"]
17
- description: str
18
-
19
- # Serialize to JSON schema
20
- schema = serialize_base_model(Status)
21
-
22
- # Deserialize back to BaseModel class
23
- DynamicStatus = deserialize_base_model(schema)
24
- instance = DynamicStatus(value="active", description="User is active")
25
- ```
4
+ to and from JSON schema representations with simplified, maintainable code.
26
5
  """
27
6
 
28
- from typing import Any, Dict, List, Literal, Type
7
+ from typing import Any, Dict, List, Literal, Tuple, Type, Union
29
8
 
30
9
  from pydantic import BaseModel, Field, create_model
31
10
 
@@ -33,66 +12,38 @@ __all__ = []
33
12
 
34
13
 
35
14
  def serialize_base_model(obj: Type[BaseModel]) -> Dict[str, Any]:
36
- """Serialize a Pydantic BaseModel to JSON schema.
37
-
38
- Args:
39
- obj (Type[BaseModel]): The Pydantic BaseModel class to serialize.
40
-
41
- Returns:
42
- A dictionary containing the JSON schema representation of the model.
43
-
44
- Example:
45
- ```python
46
- from pydantic import BaseModel
47
-
48
- class Person(BaseModel):
49
- name: str
50
- age: int
51
-
52
- schema = serialize_base_model(Person)
53
- ```
54
- """
15
+ """Serialize a Pydantic BaseModel to JSON schema."""
55
16
  return obj.model_json_schema()
56
17
 
57
18
 
58
19
  def dereference_json_schema(json_schema: Dict[str, Any]) -> Dict[str, Any]:
59
- """Dereference JSON schema by resolving $ref pointers.
60
-
61
- This function resolves all $ref references in a JSON schema by replacing
62
- them with the actual referenced definitions from the $defs section.
63
-
64
- Args:
65
- json_schema (Dict[str, Any]): The JSON schema containing potential $ref references.
66
-
67
- Returns:
68
- A dereferenced JSON schema with all $ref pointers resolved.
69
-
70
- Example:
71
- ```python
72
- schema = {
73
- "properties": {
74
- "user": {"$ref": "#/$defs/User"}
75
- },
76
- "$defs": {
77
- "User": {"type": "object", "properties": {"name": {"type": "string"}}}
78
- }
79
- }
80
- dereferenced = dereference_json_schema(schema)
81
- # user property will contain the actual User definition
82
- ```
83
- """
20
+ """Dereference JSON schema by resolving $ref pointers with circular reference protection."""
84
21
  model_map = json_schema.get("$defs", {})
85
22
 
86
- def dereference(obj):
23
+ def dereference(obj, current_path=None):
24
+ if current_path is None:
25
+ current_path = []
26
+
87
27
  if isinstance(obj, dict):
88
28
  if "$ref" in obj:
89
29
  ref = obj["$ref"].split("/")[-1]
90
- return dereference(model_map[ref])
91
- else:
92
- return {k: dereference(v) for k, v in obj.items()}
93
30
 
31
+ # Check for circular reference
32
+ if ref in current_path:
33
+ # Return a placeholder to break the cycle
34
+ return {"type": "object", "description": f"Circular reference to {ref}"}
35
+
36
+ if ref in model_map:
37
+ # Add to path and recurse
38
+ new_path = current_path + [ref]
39
+ return dereference(model_map[ref], new_path)
40
+ else:
41
+ # Invalid reference, return placeholder
42
+ return {"type": "object", "description": f"Invalid reference to {ref}"}
43
+ else:
44
+ return {k: dereference(v, current_path) for k, v in obj.items()}
94
45
  elif isinstance(obj, list):
95
- return [dereference(x) for x in obj]
46
+ return [dereference(x, current_path) for x in obj]
96
47
  else:
97
48
  return obj
98
49
 
@@ -100,134 +51,180 @@ def dereference_json_schema(json_schema: Dict[str, Any]) -> Dict[str, Any]:
100
51
  for k, v in json_schema.items():
101
52
  if k == "$defs":
102
53
  continue
103
-
104
54
  result[k] = dereference(v)
105
55
 
106
56
  return result
107
57
 
108
58
 
109
- def parse_field(v: Dict[str, Any]) -> Any:
59
+ # ============================================================================
60
+ # Type Resolution - Separated into focused functions
61
+ # ============================================================================
62
+
63
+
64
+ def _resolve_union_type(union_options: List[Dict[str, Any]]) -> Type:
65
+ """Resolve anyOf/oneOf to Union type."""
66
+ union_types = []
67
+ for option in union_options:
68
+ if option.get("type") == "null":
69
+ union_types.append(type(None))
70
+ else:
71
+ union_types.append(parse_field(option))
72
+
73
+ if len(union_types) == 1:
74
+ return union_types[0]
75
+ elif len(union_types) == 2 and type(None) in union_types:
76
+ # Optional type: T | None
77
+ non_none_type = next(t for t in union_types if t is not type(None))
78
+ return Union[non_none_type, type(None)] # type: ignore[return-value]
79
+ else:
80
+ return Union[tuple(union_types)] # type: ignore[return-value]
81
+
82
+
83
+ def _resolve_basic_type(type_name: str, field_def: Dict[str, Any]) -> Type:
84
+ """Resolve basic JSON schema types to Python types."""
85
+ type_mapping = {
86
+ "string": str,
87
+ "integer": int,
88
+ "number": float,
89
+ "boolean": bool,
90
+ "null": type(None),
91
+ }
92
+
93
+ if type_name in type_mapping:
94
+ return type_mapping[type_name] # type: ignore[return-value]
95
+ elif type_name == "object":
96
+ # Check if it's a nested model or generic dict
97
+ if "properties" in field_def:
98
+ return deserialize_base_model(field_def)
99
+ else:
100
+ return dict
101
+ elif type_name == "array":
102
+ if "items" in field_def:
103
+ inner_type = parse_field(field_def["items"])
104
+ return List[inner_type]
105
+ else:
106
+ return List[Any]
107
+ else:
108
+ raise ValueError(f"Unsupported type: {type_name}")
109
+
110
+
111
+ def parse_field(field_def: Dict[str, Any]) -> Type:
110
112
  """Parse a JSON schema field definition to a Python type.
111
113
 
112
- Converts JSON schema field definitions to corresponding Python types
113
- for use in Pydantic model creation.
114
+ Simplified version with clear separation of concerns.
115
+ """
116
+ # Handle union types
117
+ if "anyOf" in field_def:
118
+ return _resolve_union_type(field_def["anyOf"])
119
+ if "oneOf" in field_def:
120
+ return _resolve_union_type(field_def["oneOf"])
114
121
 
115
- Args:
116
- v (Dict[str, Any]): A dictionary containing the JSON schema field definition.
122
+ # Handle basic types
123
+ if "type" not in field_def:
124
+ return Any # type: ignore[return-value]
117
125
 
118
- Returns:
119
- The corresponding Python type (str, int, float, bool, dict, List, or BaseModel).
126
+ return _resolve_basic_type(field_def["type"], field_def)
120
127
 
121
- Raises:
122
- ValueError: If the field type is not supported.
123
128
 
124
- Example:
125
- ```python
126
- field_def = {"type": "string"}
127
- python_type = parse_field(field_def) # Returns str
129
+ # ============================================================================
130
+ # Field Information Creation - Centralized logic
131
+ # ============================================================================
128
132
 
129
- array_def = {"type": "array", "items": {"type": "integer"}}
130
- python_type = parse_field(array_def) # Returns List[int]
131
- ```
132
- """
133
- t = v["type"]
134
- if t == "string":
135
- return str
136
- elif t == "integer":
137
- return int
138
- elif t == "number":
139
- return float
140
- elif t == "boolean":
141
- return bool
142
- elif t == "object":
143
- # Check if it's a generic object (dict) or a nested model
144
- if "properties" in v:
145
- return deserialize_base_model(v)
146
- else:
147
- return dict
148
- elif t == "array":
149
- inner_type = parse_field(v["items"])
150
- return List[inner_type]
133
+
134
+ def _create_field_info(description: str | None, default_value: Any, is_required: bool) -> Field: # type: ignore[type-arg]
135
+ """Create Field info with consistent logic."""
136
+ if is_required and default_value is None:
137
+ # Required field without default
138
+ return Field(description=description) if description else Field()
151
139
  else:
152
- raise ValueError(f"Unsupported type: {t}")
140
+ # Optional field or field with default
141
+ return Field(default=default_value, description=description) if description else Field(default=default_value)
142
+
143
+
144
+ def _make_optional_if_needed(field_type: Type, is_required: bool, has_default: bool) -> Type:
145
+ """Make field type optional if needed."""
146
+ if is_required or has_default:
147
+ return field_type
148
+
149
+ # Check if already nullable
150
+ if hasattr(field_type, "__origin__") and field_type.__origin__ is Union and type(None) in field_type.__args__:
151
+ return field_type
152
+
153
+ # Make optional
154
+ return Union[field_type, type(None)] # type: ignore[return-value]
155
+
156
+
157
+ # ============================================================================
158
+ # Field Processing - Separated enum and regular field logic
159
+ # ============================================================================
160
+
161
+
162
+ def _process_enum_field(field_name: str, field_def: Dict[str, Any], is_required: bool) -> Tuple[Type, Field]: # type: ignore[type-arg]
163
+ """Process enum field with Literal type."""
164
+ enum_values = field_def["enum"]
165
+
166
+ # Create Literal type
167
+ if len(enum_values) == 1:
168
+ literal_type = Literal[enum_values[0]]
169
+ else:
170
+ literal_type = Literal[tuple(enum_values)]
171
+
172
+ # Handle optionality
173
+ description = field_def.get("description")
174
+ default_value = field_def.get("default")
175
+ has_default = default_value is not None
176
+
177
+ if not is_required and not has_default:
178
+ literal_type = Union[literal_type, type(None)] # type: ignore[assignment]
179
+ default_value = None
180
+
181
+ field_info = _create_field_info(description, default_value, is_required)
182
+ return literal_type, field_info # type: ignore[return-value]
183
+
184
+
185
+ def _process_regular_field(field_name: str, field_def: Dict[str, Any], is_required: bool) -> Tuple[Type, Field]: # type: ignore[type-arg]
186
+ """Process regular (non-enum) field."""
187
+ field_type = parse_field(field_def)
188
+ description = field_def.get("description")
189
+ default_value = field_def.get("default")
190
+ has_default = default_value is not None
191
+
192
+ # Handle optionality
193
+ field_type = _make_optional_if_needed(field_type, is_required, has_default)
194
+
195
+ if not is_required and not has_default:
196
+ default_value = None
197
+
198
+ field_info = _create_field_info(description, default_value, is_required)
199
+ return field_type, field_info
200
+
201
+
202
+ # ============================================================================
203
+ # Main Schema Processing - Clean and focused
204
+ # ============================================================================
153
205
 
154
206
 
155
207
  def deserialize_base_model(json_schema: Dict[str, Any]) -> Type[BaseModel]:
156
208
  """Deserialize a JSON schema to a Pydantic BaseModel class.
157
209
 
158
- Creates a dynamic Pydantic BaseModel class from a JSON schema definition.
159
- For enum fields, this function uses Literal types instead of Enum classes
160
- for better type safety and compatibility with systems like Apache Spark.
161
-
162
- Args:
163
- json_schema (Dict[str, Any]): A dictionary containing the JSON schema definition.
164
-
165
- Returns:
166
- A dynamically created Pydantic BaseModel class.
167
-
168
- Example:
169
- ```python
170
- schema = {
171
- "title": "Person",
172
- "type": "object",
173
- "properties": {
174
- "name": {"type": "string", "description": "Person's name"},
175
- "status": {
176
- "type": "string",
177
- "enum": ["active", "inactive"],
178
- "description": "Person's status"
179
- }
180
- }
181
- }
182
-
183
- PersonModel = deserialize_base_model(schema)
184
- person = PersonModel(name="John", status="active")
185
- ```
186
-
187
- Note:
188
- Enum fields are converted to Literal types for improved compatibility
189
- and type safety. This ensures better integration with data processing
190
- frameworks like Apache Spark.
210
+ Refactored version with clear separation of concerns and simplified logic.
191
211
  """
192
- fields = {}
193
- properties = dereference_json_schema(json_schema).get("properties", {})
194
-
195
- for k, v in properties.items():
196
- if "enum" in v:
197
- enum_values = v["enum"]
212
+ # Basic setup
213
+ title = json_schema.get("title", "DynamicModel")
214
+ dereferenced_schema = dereference_json_schema(json_schema)
215
+ properties = dereferenced_schema.get("properties", {})
216
+ required_fields = set(dereferenced_schema.get("required", []))
198
217
 
199
- # Always use Literal instead of Enum for better type safety and Spark compatibility
200
- if len(enum_values) == 1:
201
- literal_type = Literal[enum_values[0]]
202
- else:
203
- # Create Literal with multiple values
204
- literal_type = Literal[tuple(enum_values)]
205
-
206
- description = v.get("description")
207
- default_value = v.get("default")
208
-
209
- if default_value is not None:
210
- field_info = (
211
- Field(default=default_value, description=description)
212
- if description is not None
213
- else Field(default=default_value)
214
- )
215
- else:
216
- field_info = Field(description=description) if description is not None else Field()
218
+ # Process each field
219
+ fields = {}
220
+ for field_name, field_def in properties.items():
221
+ is_required = field_name in required_fields
217
222
 
218
- fields[k] = (literal_type, field_info)
223
+ if "enum" in field_def:
224
+ field_type, field_info = _process_enum_field(field_name, field_def, is_required)
219
225
  else:
220
- description = v.get("description")
221
- default_value = v.get("default")
222
-
223
- if default_value is not None:
224
- field_info = (
225
- Field(default=default_value, description=description)
226
- if description is not None
227
- else Field(default=default_value)
228
- )
229
- else:
230
- field_info = Field(description=description) if description is not None else Field()
226
+ field_type, field_info = _process_regular_field(field_name, field_def, is_required)
227
+
228
+ fields[field_name] = (field_type, field_info)
231
229
 
232
- fields[k] = (parse_field(v), field_info)
233
- return create_model(json_schema["title"], **fields)
230
+ return create_model(title, **fields)
@@ -79,7 +79,7 @@ __all__ = ["fillna", "FillNaResponse"]
79
79
  def get_examples(df: pd.DataFrame, target_column_name: str, max_examples: int) -> List[Dict]:
80
80
  examples: List[Dict] = []
81
81
 
82
- samples: pd.DataFrame = df.sample(frac=1)
82
+ samples: pd.DataFrame = df.sample(frac=1).reset_index(drop=True).drop_duplicates()
83
83
  samples = samples.dropna(subset=[target_column_name])
84
84
 
85
85
  for i, row in samples.head(max_examples).iterrows():
@@ -109,7 +109,7 @@ def get_instructions(df: pd.DataFrame, target_column_name: str, max_examples: in
109
109
  output_value=json.dumps({"index": row["index"], "output": row["output"]}, ensure_ascii=False),
110
110
  )
111
111
 
112
- return builder.build()
112
+ return builder.improve().build()
113
113
 
114
114
 
115
115
  class FillNaResponse(BaseModel):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openaivec
3
- Version: 0.14.0
3
+ Version: 0.14.2
4
4
  Summary: Generative mutation for tabular calculation
5
5
  Project-URL: Homepage, https://microsoft.github.io/openaivec/
6
6
  Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -98,7 +98,7 @@ survey_responses.assign(
98
98
  ).ai.extract("structured") # Auto-expands to columns
99
99
  ```
100
100
 
101
- 📓 **[See more examples →](https://microsoft.github.io/openaivec/examples/)**
101
+ 📓 **[See more examples →](https://microsoft.github.io/openaivec/examples/pandas/)**
102
102
 
103
103
  # Overview
104
104
 
@@ -746,7 +746,7 @@ uv run ruff check . --fix
746
746
  📓 **[Survey data transformation →](https://microsoft.github.io/openaivec/examples/survey_transformation/)** - Unstructured to structured data
747
747
  📓 **[Asynchronous processing examples →](https://microsoft.github.io/openaivec/examples/aio/)** - High-performance async workflows
748
748
  📓 **[Auto-generate FAQs from documents →](https://microsoft.github.io/openaivec/examples/generate_faq/)** - Create FAQs using AI
749
- 📓 **[All examples →](https://microsoft.github.io/openaivec/examples/)** - Complete collection of tutorials and use cases
749
+ 📓 **[All examples →](https://microsoft.github.io/openaivec/examples/pandas/)** - Complete collection of tutorials and use cases
750
750
 
751
751
  ## Community
752
752
 
@@ -8,7 +8,7 @@ openaivec/_prompt.py,sha256=KoJbFK4gTEDRtu9OMweJq_jQLkSPFy2Kcvao30qKhAQ,20844
8
8
  openaivec/_provider.py,sha256=dNr9Y2C97GK-pkY81odurKoDup59dLK31V3EGT2HOwE,6711
9
9
  openaivec/_proxy.py,sha256=giOxRlCCO11XQ0gNVf2IksjZZj9RwvTHkHbmbQXadEk,28916
10
10
  openaivec/_responses.py,sha256=SSa52mCYh2jF52pHaXRtj0tJQUEn52Uy8ypMT57vSks,21428
11
- openaivec/_serialize.py,sha256=mSuSVfSpXMnsFZ4JjlOEe5b22ttBUHviPs9mF99lf0A,7277
11
+ openaivec/_serialize.py,sha256=NLCKl4opc1WS24_duwpI2UGBepQ8SBh4YRxBlLwzDLw,8403
12
12
  openaivec/_util.py,sha256=dFWwjouJyvF-tqNPs2933OAt5Fw9I2Q2BvmGIfGH5k4,6423
13
13
  openaivec/pandas_ext.py,sha256=xOZvuJN6qPQZZmaHKr870kelG2XmuFHkqeAbFNayUZU,58825
14
14
  openaivec/spark.py,sha256=MqzS8nZe6R3wT_QJdopYSMZWDpagN6gyJNh-CAtvHA8,24403
@@ -28,8 +28,8 @@ openaivec/task/nlp/named_entity_recognition.py,sha256=oC6GuvfITlK-izTXoGGpYxVSOr
28
28
  openaivec/task/nlp/sentiment_analysis.py,sha256=BNwWtNT-MNA76eIJbb31641upukmRwM9ajfz8x398gE,3091
29
29
  openaivec/task/nlp/translation.py,sha256=XTZM11JFjbgTK9wHnxFgVDabXZ5bqbabXK_bq2nEkyQ,6627
30
30
  openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
31
- openaivec/task/table/fillna.py,sha256=hQn619rZoed0YJb9dX1uOKxvnfJtWy0yptzGC4yljCw,6585
32
- openaivec-0.14.0.dist-info/METADATA,sha256=R3pr1Ibcqn0MZnEwQqwlqRvss8oS8zas7lWksqUUTSk,27552
33
- openaivec-0.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
34
- openaivec-0.14.0.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
35
- openaivec-0.14.0.dist-info/RECORD,,
31
+ openaivec/task/table/fillna.py,sha256=ZVcOpuh7ULVhrt1VsWy5fPhk53XNaiD7kXGCPhh83M8,6636
32
+ openaivec-0.14.2.dist-info/METADATA,sha256=LfzZLjiWdjJASIdXqIb0Tniou0ZYPYuwjsGtka9sIYg,27566
33
+ openaivec-0.14.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
34
+ openaivec-0.14.2.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
35
+ openaivec-0.14.2.dist-info/RECORD,,