langfun 0.1.2.dev202511040805__py3-none-any.whl → 0.1.2.dev202511050805__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)
  1. langfun/core/agentic/action.py +76 -9
  2. langfun/core/agentic/action_eval.py +9 -2
  3. langfun/core/async_support.py +32 -3
  4. langfun/core/coding/python/correction.py +19 -9
  5. langfun/core/coding/python/execution.py +14 -12
  6. langfun/core/coding/python/generation.py +21 -16
  7. langfun/core/coding/python/sandboxing.py +23 -3
  8. langfun/core/component.py +42 -3
  9. langfun/core/concurrent.py +70 -6
  10. langfun/core/console.py +1 -1
  11. langfun/core/data/conversion/anthropic.py +10 -3
  12. langfun/core/data/conversion/gemini.py +9 -2
  13. langfun/core/data/conversion/openai.py +17 -7
  14. langfun/core/eval/base.py +46 -42
  15. langfun/core/eval/matching.py +5 -2
  16. langfun/core/eval/patching.py +3 -3
  17. langfun/core/eval/scoring.py +4 -3
  18. langfun/core/eval/v2/checkpointing.py +30 -4
  19. langfun/core/eval/v2/evaluation.py +59 -13
  20. langfun/core/eval/v2/example.py +22 -11
  21. langfun/core/eval/v2/experiment.py +51 -8
  22. langfun/core/eval/v2/metric_values.py +23 -3
  23. langfun/core/eval/v2/metrics.py +33 -4
  24. langfun/core/eval/v2/progress.py +9 -1
  25. langfun/core/eval/v2/reporting.py +15 -1
  26. langfun/core/eval/v2/runners.py +27 -7
  27. langfun/core/langfunc.py +45 -130
  28. langfun/core/language_model.py +88 -10
  29. langfun/core/llms/anthropic.py +27 -2
  30. langfun/core/llms/azure_openai.py +29 -17
  31. langfun/core/llms/cache/base.py +22 -2
  32. langfun/core/llms/cache/in_memory.py +48 -7
  33. langfun/core/llms/compositional.py +25 -1
  34. langfun/core/llms/deepseek.py +29 -1
  35. langfun/core/llms/fake.py +32 -1
  36. langfun/core/llms/gemini.py +9 -1
  37. langfun/core/llms/google_genai.py +29 -1
  38. langfun/core/llms/groq.py +27 -2
  39. langfun/core/llms/llama_cpp.py +22 -3
  40. langfun/core/llms/openai.py +29 -1
  41. langfun/core/llms/openai_compatible.py +18 -6
  42. langfun/core/llms/rest.py +12 -1
  43. langfun/core/llms/vertexai.py +39 -6
  44. langfun/core/logging.py +1 -1
  45. langfun/core/mcp/client.py +77 -22
  46. langfun/core/mcp/session.py +90 -10
  47. langfun/core/mcp/tool.py +83 -23
  48. langfun/core/memory.py +1 -0
  49. langfun/core/message.py +59 -12
  50. langfun/core/message_test.py +3 -0
  51. langfun/core/modalities/audio.py +21 -1
  52. langfun/core/modalities/image.py +19 -1
  53. langfun/core/modalities/mime.py +45 -2
  54. langfun/core/modalities/pdf.py +19 -1
  55. langfun/core/modalities/video.py +21 -1
  56. langfun/core/modality.py +66 -5
  57. langfun/core/natural_language.py +1 -1
  58. langfun/core/sampling.py +4 -4
  59. langfun/core/structured/completion.py +32 -37
  60. langfun/core/structured/description.py +54 -50
  61. langfun/core/structured/function_generation.py +29 -12
  62. langfun/core/structured/mapping.py +70 -15
  63. langfun/core/structured/parsing.py +90 -74
  64. langfun/core/structured/querying.py +201 -130
  65. langfun/core/structured/schema.py +70 -10
  66. langfun/core/structured/schema_generation.py +33 -14
  67. langfun/core/structured/scoring.py +45 -34
  68. langfun/core/structured/tokenization.py +24 -9
  69. langfun/core/subscription.py +2 -2
  70. langfun/core/template.py +132 -35
  71. langfun/core/template_test.py +22 -0
  72. {langfun-0.1.2.dev202511040805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/METADATA +1 -1
  73. {langfun-0.1.2.dev202511040805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/RECORD +76 -76
  74. {langfun-0.1.2.dev202511040805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/WHEEL +0 -0
  75. {langfun-0.1.2.dev202511040805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/licenses/LICENSE +0 -0
  76. {langfun-0.1.2.dev202511040805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/top_level.txt +0 -0
@@ -32,17 +32,63 @@ import pyglove as pg
 
 
 class Evaluation(experiment_lib.Experiment):
-  """Evaluation.
-
-  An evaluation can be a leaf node or a container of other evaluations,
-  depending on whether the current evaluation object is configured with
-  any `pg.oneof`.
-
-  For example, `MyEval(lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini1_5Pro()]))`
-  is a container of two sub-experiments, one for each LLM. In such case, the
-  evaluation object with `pg.oneof` is called a hyper evaluation, which
-  represents a search space of evaluations, and each sub-evaluation is called
-  a leaf evaluation, which will perform the actual evaluation.
+  """Base class for Langfun evaluations.
+
+  `lf.eval.Evaluation` is the base class for defining evaluation tasks in
+  Langfun. Users typically subclass it to implement custom evaluation logic by
+  overriding the `inputs` and `process` methods.
+
+  An `Evaluation` object encapsulates:
+
+  * **`inputs`**: A callable that returns an iterable of input examples to be
+    processed. This is usually provided by implementing an `inputs(self)`
+    method in the subclass, which yields input items for evaluation one by
+    one.
+  * **`process(self, example)`**: An abstract method that processes one
+    example and returns the output, or a tuple of (output, metadata).
+    The output will be used for computing metrics.
+  * **`metrics`**: A list of metrics (e.g., `lf.metrics.Accuracy`) to compute
+    based on the outputs from `process`. Some metrics may require users to
+    implement a `ground_truth(self, example)` method in the subclass to
+    compute metrics against ground truth.
+  * **Hyperparameters**: Any other attributes of the class serve as
+    hyperparameters for the evaluation (e.g., the language model to use).
+
+  **Running Evaluations:**
+
+  Evaluations are executed via `lf.eval.Suite` or by calling the `.run()`
+  method on an `Evaluation` instance, which returns a `Run` object
+  containing the evaluation run information and results. If an evaluation
+  contains sweepable parameters (using `pg.oneof`), `.run()` will expand it
+  into multiple evaluation sub-tasks -- one for each combination of
+  hyperparameters -- all managed within the same `Run`.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+  import pyglove as pg
+
+  class MyEval(lf.eval.Evaluation):
+    lm: lf.LanguageModel
+    prompt: str = '1 + 1 = '
+
+    def inputs(self):
+      yield 2
+
+    def process(self, example: lf.eval.Example):
+      return int(lf.query(self.prompt, lm=self.lm))
+
+    def ground_truth(self, example: lf.eval.Example) -> int:
+      return example.input
+
+  # Run evaluation using two different LMs
+  evaluation = MyEval(
+      lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini()]),
+      metrics=[lf.metrics.Accuracy()]
+  )
+  run_info = evaluation.run()
+  ```
   """
 
   inputs: Annotated[
@@ -137,7 +183,7 @@ class Evaluation(experiment_lib.Experiment):
 
     Args:
       example: An example object to process. `example.input` is an object
-        returned from `Evaluable.inputs`.
+        yielded from the `inputs()` method.
 
     Returns:
       A processed output. Or a tuple of (output, metadata).
@@ -760,7 +806,7 @@ class Evaluation(experiment_lib.Experiment):
 
 
 class EvaluationState:
-  """Evaluation state."""
+  """In-memory state of an evaluation."""
 
   class ExampleStatus(pg.Object):
     """Example state."""
@@ -22,19 +22,30 @@ import pyglove as pg
 
 @dataclasses.dataclass
 class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
-  """An item for the evaluation.
+  """An example for evaluation.
+
+  An evaluation example contains the input and output of an evaluation task,
+  as well as metadata about the evaluation process, such as execution time,
+  LLM usage, and metric results.
 
   Attributes:
-    id: The 1-based ID of the item in the evaluation set.
-    input: An element returned from the `Evaluable.inputs` functor.
-    output: The output of the `process` method. If `pg.MISSING_VALUE`, it has
-      not been processed yet.
-    metadata: The metadata of the item produced by the `process` method.
-    metric_metadata: The dictionary returned from `Metric.audit`.
-    start_time: The start time of the evaluation item.
-    end_time: The end time of the evaluation item.
-    usage_summary: The summary of LLM usages of the evaluation item.
-    execution_status: The timeit status of the evaluation item.
+    id: The 1-based ID of the example in the evaluation set.
+    input: An element returned from the `Evaluable.inputs` functor, which serves
+      as the input for `lf.Evaluable.process`.
+    output: The output of `lf.Evaluable.process` method. If `pg.MISSING_VALUE`,
+      it indicates the example has not been processed yet.
+    error: The error encountered during `lf.Evaluable.process`. If None, it
+      indicates the process was successful.
+    metadata: The metadata of the example produced by `lf.Evaluable.process`.
+    metric_metadata: The dictionary returned from `Metric.audit`, which contains
+      metadata about metric computation for this example.
+    newly_processed: Whether this example is processed in the current run. If
+      False, it indicates the example was loaded from a checkpoint from previous
+      runs.
+    start_time: The start time of processing this example.
+    end_time: The end time of processing this example.
+    usage_summary: The summary of LLM usages for processing this example.
+    execution_status: The timeit status of processing this example.
   """
   id: int
   input: Any = pg.MISSING_VALUE
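
For orientation, here is a small, hypothetical helper (not part of langfun) that reads only the `Example` attributes documented in the docstring above:

```python
import pyglove as pg


def describe(example) -> str:
  """Summarizes an `Example` using only the documented attributes."""
  if example.output is pg.MISSING_VALUE:
    return f'#{example.id}: not processed yet'
  if example.error is not None:
    return f'#{example.id}: failed with {example.error!r}'
  origin = 'new' if example.newly_processed else 'loaded from checkpoint'
  return f'#{example.id}: ok ({origin}), output={example.output!r}'
```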
@@ -139,10 +139,10 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
   # Checkpointing
 
-  Experiments support checkpointing, which is enabled by default. It allows
+  Experiments support checkpointing, which is enabled by default. It allows
   users to resume their experiments from a saved state. When an experiment runs,
-  it creates a new directory for that run and saves the current state to a
-  checkpoint file. If the experiment is interrupted or fails, users can resume
+  it creates a new directory for that run and saves its progress to checkpoint
+  files. If the experiment is interrupted or fails, users can resume
   it by specifying the 'id' or 'warm_start_from' argument (shown above) to
   seamlessly continue from previously saved state without starting over.
 
@@ -169,7 +169,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
   # Experiment Plugins
 
-  Experiment can be extended by plugins. Plugins can listen to the events of
+  Experiments can be extended by plugins. Plugins can listen to the events of
   experiment execution and produce additional outputs. For example, a plugin
   can be added to an experiment to generate additional metrics or to save
   additional data to a database. More details will be added in the future.
@@ -657,7 +657,30 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
 @pg.use_init_args(['children'])
 class Suite(Experiment):
-  """A suite of evaluations."""
+  """A suite of evaluations.
+
+  `lf.eval.Suite` groups multiple `lf.eval.Evaluation` or other `Suite`
+  objects into a single experiment, allowing them to be run, managed, and
+  reported together.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+
+  suite = lf.eval.Suite([
+      MyEval(lm=lf.llms.Gpt4()),
+      MyEval(lm=lf.llms.Gemini()),
+      lf.eval.Suite([
+          AnotherEval(lm=lf.llms.Gpt4()),
+          AnotherEval(lm=lf.llms.Gemini())
+      ])
+  ])
+
+  # Run all evaluations in the suite
+  run_info = suite.run('/path/to/my/suite_run')
+  ```
+  """
 
   children: Annotated[
       list[Experiment], 'A list of child experiments.'
@@ -791,7 +814,14 @@ class RunId(pg.Object):
 
 
 class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
-  """A run of an experiment."""
+  """Represents a single run of an experiment.
+
+  A `Run` object holds all the configurations for executing an experiment,
+  such as the experiment definition, input/output directories, and flags
+  controlling the execution behavior (e.g., error handling, checkpointing).
+  It also provides utility methods for accessing run-specific paths and
+  filtering examples for evaluation.
+  """
 
   root_dir: Annotated[
       str,
@@ -971,7 +1001,13 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
 
 
 class Runner(pg.Object):
-  """Interface for experiment runner."""
+  """Interface for experiment runner.
+
+  A runner is responsible for executing the evaluations within an experiment
+  based on the configuration specified in a `Run` object. Different runners
+  can implement different execution strategies, such as sequential or parallel
+  processing of examples and evaluations.
+  """
 
   # Class-level variable for registering the runner.
   NAME = None
@@ -1010,7 +1046,14 @@ class Runner(pg.Object):
 
 
 class Plugin(lf.Component):
-  """Base class for experiment plugins."""
+  """Base class for experiment plugins.
+
+  Plugins provide a mechanism to extend the behavior of an experiment run
+  by hooking into various events during the lifecycle of experiment and
+  example execution, such as `on_run_start`, `on_experiment_complete`,
+  `on_example_start`, etc. They can be used for custom logging, monitoring,
+  or result processing.
+  """
 
   def on_run_start(
       self,
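
To illustrate the hook-based extension model described above, here is a minimal sketch of a custom plugin. The hook names come from the docstring; their exact signatures are not shown in this diff, so the overrides below accept arbitrary arguments, and the import path is inferred from the file list.

```python
from langfun.core.eval.v2 import experiment as experiment_lib


class EventLogger(experiment_lib.Plugin):
  """Hypothetical plugin that prints experiment lifecycle events."""

  def on_run_start(self, *args, **kwargs):
    print('run started')

  def on_example_start(self, *args, **kwargs):
    print('example started')

  def on_experiment_complete(self, *args, **kwargs):
    print('experiment complete')
```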
@@ -20,7 +20,15 @@ import pyglove as pg
 
 
 class MetricValue(pg.Object):
-  """Base class for metric values."""
+  """Base class for metric values.
+
+  `MetricValue` is the base class for representing aggregated metric values
+  in an evaluation. It accumulates data points from individual examples,
+  each consisting of a value and an optional weight, associated with an example
+  ID. Subclasses must implement `reduce` method to compute a single float value
+  from accumulated data points, and `scalar_repr` to provide a string
+  representation of the reduced value.
+  """
 
   class DataPoint(pg.Object):
     """A data point for a metric value."""
@@ -133,7 +141,13 @@ class MetricValue(pg.Object):
 
 
 class Rate(MetricValue):
-  """Representing a rate in range [0, 1]."""
+  """Metric value representing a rate in range [0, 1].
+
+  `Rate` is used for metrics that compute a rate, such as accuracy or error
+  rate. The final value is computed as the weighted sum of accumulated values
+  divided by the total number of examples. It's displayed as a percentage
+  (e.g., 90.0%).
+  """
 
   def reduce(self) -> float:
     return self._weighted_sum / self.total
@@ -145,7 +159,13 @@ class Rate(MetricValue):
 
 
 class Average(MetricValue):
-  """Average of a aggregated values."""
+  """Metric value representing an average of accumulated values.
+
+  `Average` is used for metrics that compute an average score across examples
+  (e.g., average quality score). The final value is computed as the weighted
+  sum of accumulated values divided by the number of data points.
+  It's displayed as a float with 3 decimal places (e.g., 4.750).
+  """
 
   def reduce(self) -> float:
     if not self.data_points:
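
The two reductions described above can be illustrated with a standalone sketch (plain Python, not the package's `DataPoint`-based implementation): `Rate` divides the weighted sum by the total number of examples, while `Average` divides by the number of data points.

```python
def rate(values: list[float], weights: list[float], total: int) -> float:
  # Weighted sum over the total number of examples (cf. `Rate.reduce`).
  return sum(v * w for v, w in zip(values, weights)) / total


def average(values: list[float], weights: list[float]) -> float:
  # Weighted sum over the number of data points (cf. `Average.reduce`).
  return sum(v * w for v, w in zip(values, weights)) / len(values)


print(rate([1.0] * 9, [1.0] * 9, total=10))        # 0.9, shown as 90.0%
print(average([5.0, 4.5, 4.75, 4.75], [1.0] * 4))  # 4.75, shown as 4.750
```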
@@ -29,7 +29,15 @@ Average = metric_values.Average
 
 
 class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
-  """Interface for an evaluation metric."""
+  """Interface for an evaluation metric.
+
+  A metric is used to evaluate the quality of the outputs produced by an
+  evaluation. It works by auditing each processed example via its `audit`
+  method, which in turn calls the user-overridable `_audit` method to perform
+  metric-specific logic and update metric values. Metrics can compute multiple
+  values (e.g., precision, recall, F1 score) which are exposed via the
+  `values` method.
+  """
 
   name: Annotated[
       str,
@@ -169,7 +177,15 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
 
 
 class MetricBase(Metric):
-  """Base class for common metrics."""
+  """Base class for common metrics.
+
+  `MetricBase` provides common functionalities for metrics, such as automatic
+  error counting based on whether an example has an error during evaluation.
+  It distinguishes between Object-Oriented Programming (OOP) errors
+  (e.g. `MappingError` during structured output generation) and other errors.
+  Subclasses should implement `_audit_processed` for metric computation on
+  successfully processed examples.
+  """
 
   oop_errors: Rate | None = Rate()
   non_oop_errors: Rate | None = Rate()
@@ -229,7 +245,13 @@ class MetricBase(Metric):
 
 
 class Match(MetricBase):
-  """Metric for matching outputs against groundtruth."""
+  """Metric for matching outputs against ground truth.
+
+  This metric computes match and mismatch rates by comparing the output of
+  an example with its ground truth. By default, it looks for a `groundtruth`
+  attribute in `example.input` for comparison. Users can customize this behavior
+  by subclassing `Match` and overriding the `match` method.
+  """
   name = 'match'
   matches: Rate = Rate()
   mismatches: Rate = Rate()
@@ -302,7 +324,14 @@ class Match(MetricBase):
 
 
 class Score(MetricBase):
-  """Base class for scoring."""
+  """Base class for scoring metrics.
+
+  `Score` is a base class for metrics that assign a numerical score to each
+  example's output (e.g., evaluating quality on a scale of 1-5).
+  It automatically computes the average score across all examples.
+  Subclasses must implement the `score` method to define how an example
+  should be scored.
+  """
 
   name = 'score'
   average_score: Average = Average()
@@ -21,7 +21,15 @@ import pyglove as pg
 
 
 class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
-  """Evaluation progress."""
+  """Represents and tracks the progress of an evaluation.
+
+  The `Progress` class maintains counts of processed, failed, and skipped
+  items in an evaluation, along with timing information (start time, stop time,
+  duration) and an execution summary. It provides properties to check the
+  status of the evaluation (e.g., `is_started`, `is_completed`) and methods
+  to update progress as items are evaluated.
+  It also supports HTML rendering as a progress bar for visualization.
+  """
 
   num_total: Annotated[
       int | None,
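
As a usage sketch, the status properties named in the docstring can be combined into a simple label (hypothetical helper, not part of the package):

```python
def status_label(progress) -> str:
  if not progress.is_started:
    return 'pending'
  return 'completed' if progress.is_completed else 'running'
```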
@@ -33,7 +33,21 @@ _EVALULATION_DETAIL_FILE = 'index.html'
 
 
 class HtmlReporter(experiment_lib.Plugin):
-  """Plugin for periodically generating HTML reports for the experiment."""
+  """Plugin for periodically generating HTML reports for the experiment.
+
+  The `HtmlReporter` plugin generates several HTML files during an experiment
+  run:
+  - A `summary.html` at the root of the run directory, summarizing all
+    evaluations in the experiment.
+  - An `index.html` for each leaf evaluation, detailing the evaluation
+    definition, metrics, and logs.
+  - An HTML file for each example (e.g., `1.html`, `2.html`, ...) within
+    each leaf evaluation's directory, showing the input, output, metadata,
+    and any errors for that example.
+
+  These reports are updated periodically in the background during the run,
+  allowing users to monitor progress in near real-time.
+  """
 
   summary_interval: Annotated[
       int,
@@ -42,7 +42,14 @@ _RUN_MANIFEST = 'run.json'
 
 
 class RunnerBase(Runner):
-  """A simple runner that runs evaluations and their examples sequentially."""
+  """Base class for runners with plugin support and IO pooling.
+
+  `RunnerBase` provides the basic runner functionalities such as plugin
+  integration for checkpointing, reporting and progress tracking.
+  It also manages a thread pool for background IO operations.
+  Subclasses should implement `_run` and `_evaluate_items` for different
+  execution strategies.
+  """
 
   tqdm: Annotated[
       bool,
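
The concrete runners registered in the hunks below (`sequential`, `debug`, `parallel`) are selected by name. A hedged usage sketch, assuming `run()` accepts a `runner` argument matching those registered `NAME` values (this signature is inferred from the diff, not confirmed by it), and reusing `MyEval` from the `Evaluation` docstring earlier:

```python
import langfun as lf

experiment = MyEval(
    lm=lf.llms.Gpt4(),
    metrics=[lf.metrics.Accuracy()],
)

# Quick sanity check: first example only, errors raised immediately.
experiment.run('/tmp/my_eval', runner='debug')

# Full run, processing examples in parallel threads.
experiment.run('/tmp/my_eval', runner='parallel')
```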
@@ -397,11 +404,12 @@ class RunnerBase(Runner):
 
 
 class SequentialRunner(RunnerBase):
-  """Sequential runner.
+  """A runner that executes evaluations and examples sequentially.
 
-  Sequential runner runs all evaluations and their examples in sequence,
-  as well as the background tasks, it allows the developer to catch all
-  exceptions thrown from the background tasks, making it easier to debug.
+  The sequential runner executes all evaluations and their examples in the
+  calling thread. Background tasks are also run sequentially, which makes it
+  easier to debug as exceptions from background tasks will be raised
+  immediately.
   """
 
   NAME = 'sequential'
@@ -426,7 +434,13 @@ class SequentialRunner(RunnerBase):
 
 
 class DebugRunner(SequentialRunner):
-  """Debug runner."""
+  """A runner for debugging evaluations.
+
+  The debug runner is a sequential runner that only runs the first example
+  of each evaluation, with `raise_if_has_error` enabled. This is useful for
+  quickly identifying issues in evaluation logic during development.
+  Checkpointers are disabled for this runner.
+  """
 
   NAME = 'debug'
 
@@ -444,7 +458,13 @@ class DebugRunner(SequentialRunner):
 
 
 class ParallelRunner(RunnerBase):
-  """Parallel runner."""
+  """A runner that executes evaluations and examples in parallel.
+
+  The parallel runner groups evaluations by their required resources
+  (e.g., specific LLMs) and runs evaluations that do not share resources in
+  parallel. Within each evaluation, examples are also processed in parallel
+  using threads, up to `Evaluation.max_workers`.
+  """
 
   NAME = 'parallel'
 
langfun/core/langfunc.py CHANGED
@@ -32,146 +32,43 @@ _TLS_LFUN_CALL_STACK = '_langfunc_callstack'
 # NOTE(daiyip): Only the template string belongs to the positional arguments,
 # all others are keyword-only for clarity.
 @pg.use_init_args(['template_str'])
-class LangFunc(
-    template_lib.Template,
-):
-  r"""Base class for natural-language driven component.
-
-  ``LangFunc`` is a language-driven component that enables users to
-  seamlessly interact with Language Models (LLMs) using a blend of natural
-  language and code. It empowers users to easily modularize prompt/execution
-  logics, compose them, and simplify the creation of Language Model (LLM)-based
-  components and applications.
-
-  LangFunc can be conceptualized as a string template with embeddable code,
-  but it distinguishes itself from traditional template systems in four key
-  ways.
-
-  Firstly, it enables easy modularization of templates along with the required
-  values with OO principles, providing a reusable way for LLM-based content
-  generation. For example:
-
-  ```
-  class FewshotExamples(lf.LangFunc):
-    '''Base for fewshot prompt.
-
-    {% for example in examples %}
-    {{ example }}
-    {% endfor %}
-    '''
-
-  # Usage 1: __init__ time binding.
-  assert FewshotPrompt(examples=['foo', 'bar'])() == 'foo\nbar'
-
-  # Usage 2: __call__ time binding.
-  assert FewshotPrompt()(examples=['foo', 'bar']) == 'foo\nbar'
-
-  class ToolDescription(lf.LangFunc):
-    '''Tool descriptions.
-
-    {% for tool in tools %}
-    {{ tool.description }}
-    {% endfor %}
-    '''
-    # We want to constrain tools to be a list of `Tool` objects.
-    tools: list[Tool]
-
-  # Raises: runtime type checking will fail on [1, 2, 3].
-  ToolDescription(tools=[1, 2, 3])
-  ```
-
-  Secondly, it has the capability to compose multiple LangFuncs together,
-  enabling the accomplishment of complex language tasks with maximum reuse.
-  It allows users to provide program inputs to all the LangFuncs within a
-  composition at the top level, significantly simplifying the process of
-  providing context for users. For example:
-
-  ```
-  class ReAct(lf.LangFunc):
-    '''ReAct prompt for tool-use.
-
-    {{ preamble }}
-    {{ tool_description }}
-    {{ tool_examples }}
-    {{ user_input }}
-    '''
-    # Default preamble, which could be overriden from subclass
-    # or parsed from the `__init__` argument.
-    preamble = 'Please help me on my task based on the following tools.',
-
-  react = ReAct(
-      tool_description=ToolDescription()
-      tool_examples=FewshotExamples(),
-      # Partially bind `tools` and `examples`.
-      tools=my_tools,
-      examples=[t.examples for t in my_tools]
-  )
-
-  # Late bind `user_input` at __call__ time.
-  react(user_input='Help me get a lunch to go, veggie please.' )
-  ```
-
-  Thirdly, it allows the flexibility to encapsulate complex compositions to
-  reusable classes and modify them. For example:
-
-  ```
-  # The compound decorator converts a function into a LangFunc.
-  @lf.compound
-  def react_with_tools(preamble, tools: list[Tool]):
-    return ReAct(
-        preamble=preamble,
-        tool_description=ToolDescription()
-        tool_examples=FewshotExamples(),
-        # Partially bind `tools` and `examples`.
-        tools=my_tools,
-        examples=[t.examples for t in my_tools]
-    )
+class LangFunc(template_lib.Template):
+  r"""Base class for Language-based functions.
 
-  # Actually, the entire chat application is a LangFunc.
-  class Chat(lt.LangFunc):
-    '''LLM-based Chat application.
+  LangFunc represents a function powered by a language model. It is a subclass
+  of `lf.Template` and can be thought of as a `lf.Template` augmented with an LM
+  and an output transformation. Calling a `lf.LangFunc` is equivalent to calling
+  the LM with the rendered prompt and transforming the output.
 
-    llm({{ prompt }})
-    '''
+  LangFunc can be directly constructed and used.
 
-  chat = Chat(
-      llm=Bard24B(),
-      prompt=react_with_tools(
-          preamble=(
-              f'Please help me solve my problem using tools. '
-              f'Current time is {{datetime.datetime.now()}}'),
-          tools=my_tools))
+  ```python
+  import langfun as lf
 
-  chat(user_input='Help me get a lunch to go, veggie please.')
-  ```
+  func = lf.LangFunc("Hello, {{name}}!")
+  print(func(name="Gemini", lm=lf.llms.Gemini25Flash()))
+  # Output: Hello, how are you today?
+  ```
 
-  Fourthly, LangFunc is built on top of PyGlove symbolic programming power,
-  it could be manipulated programmatically, turned into a space for data
-  sampling, or even tuned by AutoML. For example:
+  Or it can be subclassed:
 
-  ```
-  import pyglove as pg
+  ```python
+  import langfun as lf
 
-  prompt_space = react_with_tools(
-      preamble=pg.oneof([
-          'Help me solve my problem using the following tools:',
-          'Help me with the tools below:',
-          ...
-      ])
-      # Choose any two of the tools for generating data.
-      tools=pg.manyof(2, [
-          google_search(...),
-          doordash(...),
-          ...
-      ])
+  class Compute(lf.LangFunc):
+    '''Compute a simple arithmetic expression.
 
-  for prompt in pg.random_sample(prompt_space):
-    print(prompt(user_input='Help me book a conf room please.'))
+    {{expression}} = ?
+    '''
+    expression: str
 
-  ```
+    def transform_output(self, lm_output: lf.Message) -> lf.Message:
+      lm_output.metadata.result = float(lm_output.text)
+      return lm_output
 
-  For more capabilities on symbolic programming with PyGlove, please checkout
-  https://pyglove.readthedocs.io/en/latest/.
+  r = Compute(expression="1 + 1")(lm=lf.llms.Gemini25Flash())
+  print(r.result)
+  # Output: 2.0
 
   Final note: always include these capitalized words if you don't want to treat
   the docstr as the template str: THIS IS NOT A TEMPLATE. So as a result, this
@@ -305,6 +202,24 @@ class LangFunc(
       message_cls: Type[message_lib.Message] = message_lib.UserMessage,
       **kwargs,
   ) -> message_lib.Message:
+    """Renders the template and transforms it as LM input message.
+
+    Args:
+      allow_partial: If True, allows partial rendering, which leaves unresolved
+        variables in place in the output text. Otherwise, raises error when
+        there are unresolved variables.
+      implicit: If True, reuse the rendering output if a parent `lf.Template`
+        is rendering current `lf.Template` multiple times. This is important
+        for making sure all references to the same `lf.Template` within a single
+        top-level rendering would return the same result. If False, every call
+        to `render` will trigger the actual rendering process.
+      message_cls: The message class used for creating the return value.
+      **kwargs: Values for template variables, which override values from
+        member attributes or context.
+
+    Returns:
+      A Message object containing the rendered result.
+    """
     lm_input = super().render(
         allow_partial=allow_partial,
         implicit=implicit,
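
A small usage sketch of `render()` as documented above: keyword arguments fill template variables, and the result is a `Message` (a `UserMessage` by default) whose text is the rendered prompt.

```python
import langfun as lf

func = lf.LangFunc('Hello, {{name}}!')
lm_input = func.render(name='World')
print(type(lm_input).__name__, lm_input.text)
# UserMessage Hello, World!
```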