openaivec 0.14.3__tar.gz → 0.14.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. {openaivec-0.14.3 → openaivec-0.14.5}/PKG-INFO +1 -1
  2. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_provider.py +15 -0
  3. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_proxy.py +24 -2
  4. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_schema.py +47 -6
  5. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/pandas_ext.py +844 -329
  6. openaivec-0.14.5/tests/test_schema.py +371 -0
  7. openaivec-0.14.3/tests/test_schema.py +0 -103
  8. {openaivec-0.14.3 → openaivec-0.14.5}/.env.example +0 -0
  9. {openaivec-0.14.3 → openaivec-0.14.5}/.github/copilot-instructions.md +0 -0
  10. {openaivec-0.14.3 → openaivec-0.14.5}/.github/workflows/python-mkdocs.yml +0 -0
  11. {openaivec-0.14.3 → openaivec-0.14.5}/.github/workflows/python-package.yml +0 -0
  12. {openaivec-0.14.3 → openaivec-0.14.5}/.github/workflows/python-test.yml +0 -0
  13. {openaivec-0.14.3 → openaivec-0.14.5}/.github/workflows/python-update.yml +0 -0
  14. {openaivec-0.14.3 → openaivec-0.14.5}/.gitignore +0 -0
  15. {openaivec-0.14.3 → openaivec-0.14.5}/CODE_OF_CONDUCT.md +0 -0
  16. {openaivec-0.14.3 → openaivec-0.14.5}/LICENSE +0 -0
  17. {openaivec-0.14.3 → openaivec-0.14.5}/README.md +0 -0
  18. {openaivec-0.14.3 → openaivec-0.14.5}/SECURITY.md +0 -0
  19. {openaivec-0.14.3 → openaivec-0.14.5}/SUPPORT.md +0 -0
  20. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/main.md +0 -0
  21. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/pandas_ext.md +0 -0
  22. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/spark.md +0 -0
  23. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/task.md +0 -0
  24. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
  25. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
  26. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
  27. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
  28. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
  29. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
  30. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
  31. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
  32. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
  33. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
  34. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
  35. {openaivec-0.14.3 → openaivec-0.14.5}/docs/api/tasks/nlp/translation.md +0 -0
  36. {openaivec-0.14.3 → openaivec-0.14.5}/docs/index.md +0 -0
  37. {openaivec-0.14.3 → openaivec-0.14.5}/docs/robots.txt +0 -0
  38. {openaivec-0.14.3 → openaivec-0.14.5}/mkdocs.yml +0 -0
  39. {openaivec-0.14.3 → openaivec-0.14.5}/pyproject.toml +0 -0
  40. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/__init__.py +0 -0
  41. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_di.py +0 -0
  42. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_embeddings.py +0 -0
  43. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_log.py +0 -0
  44. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_model.py +0 -0
  45. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_optimize.py +0 -0
  46. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_prompt.py +0 -0
  47. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_responses.py +0 -0
  48. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_serialize.py +0 -0
  49. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/_util.py +0 -0
  50. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/spark.py +0 -0
  51. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/__init__.py +0 -0
  52. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/customer_support/__init__.py +0 -0
  53. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/customer_support/customer_sentiment.py +0 -0
  54. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/customer_support/inquiry_classification.py +0 -0
  55. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/customer_support/inquiry_summary.py +0 -0
  56. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/customer_support/intent_analysis.py +0 -0
  57. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/customer_support/response_suggestion.py +0 -0
  58. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/customer_support/urgency_analysis.py +0 -0
  59. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/nlp/__init__.py +0 -0
  60. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/nlp/dependency_parsing.py +0 -0
  61. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/nlp/keyword_extraction.py +0 -0
  62. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/nlp/morphological_analysis.py +0 -0
  63. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/nlp/named_entity_recognition.py +0 -0
  64. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/nlp/sentiment_analysis.py +0 -0
  65. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/nlp/translation.py +0 -0
  66. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/table/__init__.py +0 -0
  67. {openaivec-0.14.3 → openaivec-0.14.5}/src/openaivec/task/table/fillna.py +0 -0
  68. {openaivec-0.14.3 → openaivec-0.14.5}/tests/__init__.py +0 -0
  69. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_di.py +0 -0
  70. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_embeddings.py +0 -0
  71. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_optimize.py +0 -0
  72. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_pandas_ext.py +0 -0
  73. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_prompt.py +0 -0
  74. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_provider.py +0 -0
  75. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_proxy.py +0 -0
  76. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_proxy_suggester.py +0 -0
  77. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_responses.py +0 -0
  78. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_serialize.py +0 -0
  79. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_serialize_pydantic_v2_compliance.py +0 -0
  80. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_spark.py +0 -0
  81. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_task.py +0 -0
  82. {openaivec-0.14.3 → openaivec-0.14.5}/tests/test_util.py +0 -0
  83. {openaivec-0.14.3 → openaivec-0.14.5}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openaivec
3
- Version: 0.14.3
3
+ Version: 0.14.5
4
4
  Summary: Generative mutation for tabular calculation
5
5
  Project-URL: Homepage, https://microsoft.github.io/openaivec/
6
6
  Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -13,6 +13,7 @@ from openaivec._model import (
13
13
  OpenAIAPIKey,
14
14
  ResponsesModelName,
15
15
  )
16
+ from openaivec._schema import SchemaInferer
16
17
  from openaivec._util import TextChunker
17
18
 
18
19
  __all__ = []
@@ -142,6 +143,13 @@ CONTAINER.register(OpenAI, provide_openai_client)
142
143
  CONTAINER.register(AsyncOpenAI, provide_async_openai_client)
143
144
  CONTAINER.register(tiktoken.Encoding, lambda: tiktoken.get_encoding("o200k_base"))
144
145
  CONTAINER.register(TextChunker, lambda: TextChunker(CONTAINER.resolve(tiktoken.Encoding)))
146
+ CONTAINER.register(
147
+ SchemaInferer,
148
+ lambda: SchemaInferer(
149
+ client=CONTAINER.resolve(OpenAI),
150
+ model_name=CONTAINER.resolve(ResponsesModelName).value,
151
+ ),
152
+ )
145
153
 
146
154
 
147
155
  def reset_environment_registrations():
@@ -160,3 +168,10 @@ def reset_environment_registrations():
160
168
  )
161
169
  CONTAINER.register(OpenAI, provide_openai_client)
162
170
  CONTAINER.register(AsyncOpenAI, provide_async_openai_client)
171
+ CONTAINER.register(
172
+ SchemaInferer,
173
+ lambda: SchemaInferer(
174
+ client=CONTAINER.resolve(OpenAI),
175
+ model_name=CONTAINER.resolve(ResponsesModelName).value,
176
+ ),
177
+ )
@@ -460,7 +460,20 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
460
460
  self.__process_owned(owned, map_func)
461
461
  self.__wait_for(wait_for, map_func)
462
462
 
463
- return self.__values(items)
463
+ # Fetch results before purging None entries
464
+ results = self.__values(items)
465
+
466
+ # Remove None values from cache so they are recomputed on future calls
467
+ with self._lock:
468
+ if self._cache: # micro-optimization
469
+ for k in set(items):
470
+ try:
471
+ if self._cache.get(k, object()) is None:
472
+ del self._cache[k]
473
+ except KeyError:
474
+ pass
475
+
476
+ return results
464
477
 
465
478
 
466
479
  @dataclass
@@ -745,4 +758,13 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
745
758
  await self.__process_owned(owned, map_func)
746
759
  await self.__wait_for(wait_for, map_func)
747
760
 
748
- return await self.__values(items)
761
+ results = await self.__values(items)
762
+
763
+ # Remove None values from cache after retrieval to avoid persisting incomplete results
764
+ async with self._lock:
765
+ if self._cache:
766
+ for k in set(items):
767
+ if self._cache.get(k, object()) is None:
768
+ self._cache.pop(k, None)
769
+
770
+ return results
@@ -128,6 +128,12 @@ class InferredSchema(BaseModel):
128
128
  redundancy removed).
129
129
  examples_summary: Neutral description of structural / semantic patterns
130
130
  observed in the examples (domain, recurring signals, constraints).
131
+ examples_purpose_alignment: Analytical explanation of how the concrete
132
+ recurring patterns in the provided examples *justify*, *constrain*,
133
+ or *refine* the stated purpose. Should map purpose facets to
134
+ observed evidence (or explicitly note gaps) to discourage
135
+ hallucinated fields and anchor extraction scope. This is an
136
+ internal quality aid – downstream consumers typically ignore it.
131
137
  fields: Ordered list of ``FieldSpec`` objects comprising the schema's
132
138
  sole authoritative contract.
133
139
  inference_prompt: Self-contained extraction instructions enforcing an
@@ -147,6 +153,13 @@ class InferredSchema(BaseModel):
147
153
  "patterns, and notable constraints."
148
154
  )
149
155
  )
156
+ examples_purpose_alignment: str = Field(
157
+ description=(
158
+ "Explanation of how observable recurring patterns in the examples substantiate and bound the stated "
159
+ "purpose. Should reference purpose facets and cite supporting example evidence (or note any gaps) to "
160
+ "reduce hallucinated fields. Internal diagnostic / quality aid; not required for downstream extraction."
161
+ )
162
+ )
150
163
  fields: List[FieldSpec] = Field(
151
164
  description=(
152
165
  "Ordered list of proposed fields derived strictly from observable, repeatable signals in the "
@@ -234,7 +247,7 @@ class InferredSchema(BaseModel):
234
247
  py_type = enum_cls
235
248
  else:
236
249
  py_type = type_map[spec.type]
237
- fields[spec.name] = (py_type, ...)
250
+ fields[spec.name] = (py_type, Field(description=spec.description))
238
251
 
239
252
  model = create_model("InferredSchema", **fields) # type: ignore[call-arg]
240
253
  return model
@@ -281,11 +294,15 @@ You are a schema inference engine.
281
294
  Task:
282
295
  1. Normalize the user's purpose (eliminate ambiguity, redundancy, contradictions).
283
296
  2. Objectively summarize observable patterns in the example texts.
284
- 3. Propose a minimal flat set of scalar fields (no nesting / arrays) that are reliably extractable.
285
- 4. Skip fields likely missing in a large share (>~20%) of realistic inputs.
286
- 5. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
297
+ 3. Produce an "examples_purpose_alignment" explanation that explicitly maps purpose facets
298
+ to concrete recurring evidence in the examples (or flags gaps). Use concise bullet‑style
299
+ sentences (still a plain string) such as: "purpose facet -> supporting pattern / gap".
300
+ This MUST NOT introduce new domain facts beyond the examples & purpose.
301
+ 4. Propose a minimal flat set of scalar fields (no nesting / arrays) that are reliably extractable.
302
+ 5. Skip fields likely missing in a large share (>~20%) of realistic inputs.
303
+ 6. Provide enum_values ONLY when a small stable closed categorical set (2–24 lowercase tokens)
287
304
  is clearly evidenced; never invent.
288
- 6. If the purpose indicates prediction (predict / probability / likelihood), output only
305
+ 7. If the purpose indicates prediction (predict / probability / likelihood), output only
289
306
  explanatory features (no target restatement).
290
307
 
291
308
  Rules:
@@ -305,6 +322,7 @@ Output contract:
305
322
  Return exactly an InferredSchema object with JSON keys:
306
323
  - purpose (string)
307
324
  - examples_summary (string)
325
+ - examples_purpose_alignment (string)
308
326
  - fields (array of FieldSpec objects: name, type, description, enum_values?)
309
327
  - inference_prompt (string)
310
328
  """.strip()
@@ -359,10 +377,31 @@ class SchemaInferer:
359
377
  raise ValueError("max_retries must be >= 1")
360
378
 
361
379
  last_err: Exception | None = None
380
+ previous_errors: list[str] = []
362
381
  for attempt in range(max_retries):
382
+ if attempt == 0:
383
+ instructions = _INFER_INSTRUCTIONS
384
+ else:
385
+ # Provide structured feedback for correction. Keep concise and prohibit speculative expansion.
386
+ feedback_lines = [
387
+ "--- PRIOR VALIDATION FEEDBACK ---",
388
+ ]
389
+ for i, err in enumerate(previous_errors[-5:], 1): # include last up to 5 errors
390
+ feedback_lines.append(f"{i}. {err}")
391
+ feedback_lines.extend(
392
+ [
393
+ "Adjust ONLY listed issues; avoid adding brand-new fields unless essential.",
394
+ "Don't hallucinate or broaden enum_values unless enum rule caused failure.",
395
+ "Duplicate names: minimally rename; keep semantics.",
396
+ "Unsupported type: change to string|integer|float|boolean (no new facts).",
397
+ "Bad enum length: drop enum or constrain to 2–24 evidenced tokens.",
398
+ ]
399
+ )
400
+ instructions = _INFER_INSTRUCTIONS + "\n\n" + "\n".join(feedback_lines)
401
+
363
402
  response: ParsedResponse[InferredSchema] = self.client.responses.parse(
364
403
  model=self.model_name,
365
- instructions=_INFER_INSTRUCTIONS,
404
+ instructions=instructions,
366
405
  input=data.model_dump_json(),
367
406
  text_format=InferredSchema,
368
407
  *args,
@@ -371,8 +410,10 @@ class SchemaInferer:
371
410
  parsed = response.output_parsed
372
411
  try:
373
412
  _basic_field_list_validation(parsed)
413
+ parsed.build_model() # ensure dynamic model creation succeeds
374
414
  except ValueError as e:
375
415
  last_err = e
416
+ previous_errors.append(str(e))
376
417
  if attempt == max_retries - 1:
377
418
  raise
378
419
  continue