openaivec 0.14.5__py3-none-any.whl → 0.14.7__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
openaivec/_provider.py CHANGED
@@ -145,7 +145,7 @@ CONTAINER.register(tiktoken.Encoding, lambda: tiktoken.get_encoding("o200k_base"
145
145
  CONTAINER.register(TextChunker, lambda: TextChunker(CONTAINER.resolve(tiktoken.Encoding)))
146
146
  CONTAINER.register(
147
147
  SchemaInferer,
148
- lambda SchemaInferer: SchemaInferer(
148
+ lambda: SchemaInferer(
149
149
  client=CONTAINER.resolve(OpenAI),
150
150
  model_name=CONTAINER.resolve(ResponsesModelName).value,
151
151
  ),
openaivec/_schema.py CHANGED
@@ -87,7 +87,12 @@ class FieldSpec(BaseModel):
87
87
  name: str = Field(
88
88
  description=(
89
89
  "Lower snake_case identifier (regex: ^[a-z][a-z0-9_]*$). Must be unique across all fields and "
90
- "express the semantic meaning succinctly (no adjectives like 'best', 'great')."
90
+ "express the semantic meaning succinctly (no adjectives like 'best', 'great'). For numeric (integer|float) "
91
+ "fields the name MUST include an explicit unit or measure suffix (e.g. _count, _total_count, "
92
+ "_duration_seconds, _ms, _price_usd, _ratio, _score) to eliminate ambiguity. Avoid bare numeric nouns like "
93
+ "'duration' or 'value' without unit/scale. Boolean field names MUST begin with 'is_' followed by a "
94
+ "descriptive predicate (e.g. is_active, is_delayed). Use positive forms (is_active) rather than "
95
+ "negated forms (is_not_active)."
91
96
  )
92
97
  )
93
98
  type: Literal["string", "integer", "float", "boolean"] = Field(
@@ -101,7 +106,10 @@ class FieldSpec(BaseModel):
101
106
  description: str = Field(
102
107
  description=(
103
108
  "Concise, objective definition plus extraction rule (what qualifies / what to ignore). Avoid subjective, "
104
- "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction."
109
+ "speculative, or promotional language. If ambiguity exists with another field, clarify the distinction. "
110
+ "Do NOT simply restate an original JSON/key name if the examples are already structured; only include a "
111
+ "raw key verbatim when it is already the minimal, irreducible analytic unit. For derived fields, clearly "
112
+ "state the transformation (e.g. sentiment of comment_text, normalized date, language code)."
105
113
  )
106
114
  )
107
115
  enum_values: Optional[List[str]] = Field(
@@ -312,11 +320,25 @@ Rules:
312
320
  * float = any decimals / ratios
313
321
  * boolean = explicit binary
314
322
  * else use string
323
+ - Numeric (integer|float) field names MUST encode an explicit unit / scale / measure suffix
324
+ (e.g. *_count, *_seconds, *_ms, *_usd, *_ratio, *_score). Avoid ambiguous bare numeric names.
325
+ - Boolean field names MUST start with 'is_' followed by a positive predicate (e.g. is_active,
326
+ is_delayed). Avoid negated forms.
315
327
  - No arrays, objects, composite encodings, or merged multi-concept fields.
316
328
  - Descriptions: concise, objective extraction rules (no marketing/emotion/speculation).
317
329
  - enum_values only for string fields with stable closed vocab; omit otherwise.
318
330
  - Exclude direct outcome labels (e.g. attrition_probability, will_buy, purchase_likelihood)
319
331
  in predictive / feature engineering contexts.
332
+ - When examples already appear as serialized JSON / key-value records, DO NOT merely relist the
333
+ raw original keys unless each is already an atomic, irreducible analytic signal. Prefer high-signal
334
+ derived / normalized / aggregated features (e.g. sentiment, category, language_code, boolean flags,
335
+ normalized_date, count metrics).
336
+ - Superficial renames (adding trivial prefixes/suffixes like _value, _field, new_) are forbidden; a new
337
+ field name must reflect a semantic transformation.
338
+ - Keep field count focused (typically <= 12) prioritizing reusable analytical / ML features over low-signal
339
+ restatements.
340
+ - If you retain an original raw key unchanged, its description must justify why it is minimal and cannot
341
+ be further decomposed without losing analytical value.
320
342
 
321
343
  Output contract:
322
344
  Return exactly an InferredSchema object with JSON keys:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openaivec
3
- Version: 0.14.5
3
+ Version: 0.14.7
4
4
  Summary: Generative mutation for tabular calculation
5
5
  Project-URL: Homepage, https://microsoft.github.io/openaivec/
6
6
  Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -5,10 +5,10 @@ openaivec/_log.py,sha256=1qhc9CF4D4bwiF_VWHilcYBPcTqIKyI0zuNEfn0MLNA,1430
5
5
  openaivec/_model.py,sha256=xg3s9Ljqb2xK1t_a5bwWxGJfFSIuaNrFGMgQq4nQKrM,3351
6
6
  openaivec/_optimize.py,sha256=-mKjD5YV_d1Z2nqfGfAcmx6mTKn6AODjFTrIKJPbAXQ,3851
7
7
  openaivec/_prompt.py,sha256=KoJbFK4gTEDRtu9OMweJq_jQLkSPFy2Kcvao30qKhAQ,20844
8
- openaivec/_provider.py,sha256=d7ZjD3Rd2z4g63UwkrKvlw1Z9EcbAItrJiixaay4MCs,7159
8
+ openaivec/_provider.py,sha256=YLrEcb4aWBD1fj0n6PNcJpCtEXK6jkUuRH_WxcLDCuI,7145
9
9
  openaivec/_proxy.py,sha256=J0qGDcZqSab26ScA8OXxzornfwuXtrVycqup-JPq464,29719
10
10
  openaivec/_responses.py,sha256=xtkiOn01RkauHq2FAKRAcjPglH8rmbaSz0-VE0ClTe8,24026
11
- openaivec/_schema.py,sha256=9enwqE2idLLUKbQxjiNn09uhdKz14kihEwUXglRqxx0,20543
11
+ openaivec/_schema.py,sha256=Akvu1AtPoZ5DeFr6Vujx5ScerIiQj6X1rtLvNmPC8dc,22521
12
12
  openaivec/_serialize.py,sha256=NLCKl4opc1WS24_duwpI2UGBepQ8SBh4YRxBlLwzDLw,8403
13
13
  openaivec/_util.py,sha256=dFWwjouJyvF-tqNPs2933OAt5Fw9I2Q2BvmGIfGH5k4,6423
14
14
  openaivec/pandas_ext.py,sha256=xa2DhE6Of8ZwZM3sImG7PSeGvtGkspT-697uHc85R9I,85970
@@ -30,7 +30,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=BNwWtNT-MNA76eIJbb31641upukmRwM9
30
30
  openaivec/task/nlp/translation.py,sha256=XTZM11JFjbgTK9wHnxFgVDabXZ5bqbabXK_bq2nEkyQ,6627
31
31
  openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
32
32
  openaivec/task/table/fillna.py,sha256=ZVcOpuh7ULVhrt1VsWy5fPhk53XNaiD7kXGCPhh83M8,6636
33
- openaivec-0.14.5.dist-info/METADATA,sha256=chAhTTfFnXuZdxKQK5sVEJfOX1wT242b_g-TtHuurao,27566
34
- openaivec-0.14.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
35
- openaivec-0.14.5.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
36
- openaivec-0.14.5.dist-info/RECORD,,
33
+ openaivec-0.14.7.dist-info/METADATA,sha256=y3scYXX1tkfIF18dPR6rA14Oen1ZUfp3PuQXxE1Ylts,27566
34
+ openaivec-0.14.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
35
+ openaivec-0.14.7.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
36
+ openaivec-0.14.7.dist-info/RECORD,,