rubyllm-observ 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (209) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +778 -0
  3. data/Rakefile +49 -0
  4. data/app/assets/javascripts/observ/application.js +12 -0
  5. data/app/assets/javascripts/observ/controllers/autoscroll_controller.js +33 -0
  6. data/app/assets/javascripts/observ/controllers/chat_form_controller.js +93 -0
  7. data/app/assets/javascripts/observ/controllers/copy_controller.js +43 -0
  8. data/app/assets/javascripts/observ/controllers/dashboard_controller.js +58 -0
  9. data/app/assets/javascripts/observ/controllers/drawer_controller.js +58 -0
  10. data/app/assets/javascripts/observ/controllers/expandable_controller.js +33 -0
  11. data/app/assets/javascripts/observ/controllers/filter_controller.js +36 -0
  12. data/app/assets/javascripts/observ/controllers/index.js +52 -0
  13. data/app/assets/javascripts/observ/controllers/json_viewer_controller.js +260 -0
  14. data/app/assets/javascripts/observ/controllers/message_form_controller.js +58 -0
  15. data/app/assets/javascripts/observ/controllers/prompt_variables_controller.js +64 -0
  16. data/app/assets/javascripts/observ/controllers/text_select_controller.js +14 -0
  17. data/app/assets/stylesheets/observ/_annotations.scss +127 -0
  18. data/app/assets/stylesheets/observ/_card.scss +52 -0
  19. data/app/assets/stylesheets/observ/_chat.scss +156 -0
  20. data/app/assets/stylesheets/observ/_components.scss +460 -0
  21. data/app/assets/stylesheets/observ/_dashboard.scss +40 -0
  22. data/app/assets/stylesheets/observ/_datasets.scss +697 -0
  23. data/app/assets/stylesheets/observ/_drawer.scss +273 -0
  24. data/app/assets/stylesheets/observ/_json_viewer.scss +120 -0
  25. data/app/assets/stylesheets/observ/_layout.scss +256 -0
  26. data/app/assets/stylesheets/observ/_metrics.scss +99 -0
  27. data/app/assets/stylesheets/observ/_observations.scss +160 -0
  28. data/app/assets/stylesheets/observ/_pagination.scss +143 -0
  29. data/app/assets/stylesheets/observ/_prompts.scss +365 -0
  30. data/app/assets/stylesheets/observ/_table.scss +53 -0
  31. data/app/assets/stylesheets/observ/_variables.scss +53 -0
  32. data/app/assets/stylesheets/observ/application.scss +15 -0
  33. data/app/controllers/observ/annotations_controller.rb +144 -0
  34. data/app/controllers/observ/application_controller.rb +8 -0
  35. data/app/controllers/observ/chats_controller.rb +58 -0
  36. data/app/controllers/observ/dashboard_controller.rb +159 -0
  37. data/app/controllers/observ/dataset_items_controller.rb +85 -0
  38. data/app/controllers/observ/dataset_run_items_controller.rb +84 -0
  39. data/app/controllers/observ/dataset_runs_controller.rb +110 -0
  40. data/app/controllers/observ/datasets_controller.rb +74 -0
  41. data/app/controllers/observ/messages_controller.rb +26 -0
  42. data/app/controllers/observ/observations_controller.rb +59 -0
  43. data/app/controllers/observ/prompt_versions_controller.rb +148 -0
  44. data/app/controllers/observ/prompts_controller.rb +205 -0
  45. data/app/controllers/observ/sessions_controller.rb +45 -0
  46. data/app/controllers/observ/traces_controller.rb +86 -0
  47. data/app/forms/observ/prompt_form.rb +96 -0
  48. data/app/helpers/observ/application_helper.rb +9 -0
  49. data/app/helpers/observ/chats_helper.rb +47 -0
  50. data/app/helpers/observ/dashboard_helper.rb +154 -0
  51. data/app/helpers/observ/datasets_helper.rb +62 -0
  52. data/app/helpers/observ/pagination_helper.rb +38 -0
  53. data/app/jobs/observ/application_job.rb +4 -0
  54. data/app/jobs/observ/dataset_runner_job.rb +49 -0
  55. data/app/mailers/observ/application_mailer.rb +6 -0
  56. data/app/models/concerns/observ/agent_phaseable.rb +124 -0
  57. data/app/models/concerns/observ/agent_selectable.rb +50 -0
  58. data/app/models/concerns/observ/chat_enhancements.rb +109 -0
  59. data/app/models/concerns/observ/message_enhancements.rb +31 -0
  60. data/app/models/concerns/observ/observability_instrumentation.rb +124 -0
  61. data/app/models/concerns/observ/prompt_management.rb +320 -0
  62. data/app/models/concerns/observ/trace_association.rb +9 -0
  63. data/app/models/observ/annotation.rb +23 -0
  64. data/app/models/observ/application_record.rb +5 -0
  65. data/app/models/observ/dataset.rb +51 -0
  66. data/app/models/observ/dataset_item.rb +41 -0
  67. data/app/models/observ/dataset_run.rb +104 -0
  68. data/app/models/observ/dataset_run_item.rb +111 -0
  69. data/app/models/observ/generation.rb +56 -0
  70. data/app/models/observ/null_prompt.rb +59 -0
  71. data/app/models/observ/observation.rb +38 -0
  72. data/app/models/observ/prompt.rb +315 -0
  73. data/app/models/observ/score.rb +51 -0
  74. data/app/models/observ/session.rb +131 -0
  75. data/app/models/observ/span.rb +13 -0
  76. data/app/models/observ/trace.rb +135 -0
  77. data/app/presenters/observ/agent_select_presenter.rb +59 -0
  78. data/app/services/observ/agent_executor_service.rb +174 -0
  79. data/app/services/observ/agent_provider.rb +60 -0
  80. data/app/services/observ/agent_selection_service.rb +53 -0
  81. data/app/services/observ/chat_instrumenter.rb +523 -0
  82. data/app/services/observ/dataset_runner_service.rb +153 -0
  83. data/app/services/observ/evaluator_runner_service.rb +58 -0
  84. data/app/services/observ/evaluators/base_evaluator.rb +51 -0
  85. data/app/services/observ/evaluators/contains_evaluator.rb +53 -0
  86. data/app/services/observ/evaluators/exact_match_evaluator.rb +23 -0
  87. data/app/services/observ/evaluators/json_structure_evaluator.rb +44 -0
  88. data/app/services/observ/prompt_manager/cache_statistics.rb +82 -0
  89. data/app/services/observ/prompt_manager/caching.rb +167 -0
  90. data/app/services/observ/prompt_manager/comparison.rb +49 -0
  91. data/app/services/observ/prompt_manager/version_management.rb +96 -0
  92. data/app/services/observ/prompt_manager.rb +40 -0
  93. data/app/services/observ/trace_text_formatter.rb +349 -0
  94. data/app/validators/observ/prompt_config_validator.rb +187 -0
  95. data/app/views/kaminari/_first_page.html.erb +11 -0
  96. data/app/views/kaminari/_gap.html.erb +8 -0
  97. data/app/views/kaminari/_last_page.html.erb +11 -0
  98. data/app/views/kaminari/_next_page.html.erb +11 -0
  99. data/app/views/kaminari/_page.html.erb +12 -0
  100. data/app/views/kaminari/_paginator.html.erb +25 -0
  101. data/app/views/kaminari/_prev_page.html.erb +11 -0
  102. data/app/views/kaminari/observ/_first_page.html.erb +11 -0
  103. data/app/views/kaminari/observ/_gap.html.erb +8 -0
  104. data/app/views/kaminari/observ/_last_page.html.erb +11 -0
  105. data/app/views/kaminari/observ/_next_page.html.erb +11 -0
  106. data/app/views/kaminari/observ/_page.html.erb +12 -0
  107. data/app/views/kaminari/observ/_paginator.html.erb +25 -0
  108. data/app/views/kaminari/observ/_prev_page.html.erb +11 -0
  109. data/app/views/layouts/observ/application.html.erb +88 -0
  110. data/app/views/observ/annotations/_annotation.html.erb +13 -0
  111. data/app/views/observ/annotations/_form.html.erb +28 -0
  112. data/app/views/observ/annotations/index.html.erb +28 -0
  113. data/app/views/observ/annotations/sessions_index.html.erb +48 -0
  114. data/app/views/observ/annotations/traces_index.html.erb +48 -0
  115. data/app/views/observ/chats/_form.html.erb +45 -0
  116. data/app/views/observ/chats/index.html.erb +67 -0
  117. data/app/views/observ/chats/new.html.erb +17 -0
  118. data/app/views/observ/chats/show.html.erb +34 -0
  119. data/app/views/observ/dashboard/index.html.erb +236 -0
  120. data/app/views/observ/dataset_items/_form.html.erb +49 -0
  121. data/app/views/observ/dataset_items/edit.html.erb +18 -0
  122. data/app/views/observ/dataset_items/index.html.erb +95 -0
  123. data/app/views/observ/dataset_items/new.html.erb +18 -0
  124. data/app/views/observ/dataset_run_items/_score_close_drawer.html.erb +4 -0
  125. data/app/views/observ/dataset_run_items/_score_drawer.html.erb +75 -0
  126. data/app/views/observ/dataset_run_items/_score_success.html.erb +29 -0
  127. data/app/views/observ/dataset_run_items/_scores_cell.html.erb +19 -0
  128. data/app/views/observ/dataset_run_items/details_drawer.turbo_stream.erb +80 -0
  129. data/app/views/observ/dataset_run_items/score_drawer.turbo_stream.erb +7 -0
  130. data/app/views/observ/dataset_runs/index.html.erb +108 -0
  131. data/app/views/observ/dataset_runs/new.html.erb +57 -0
  132. data/app/views/observ/dataset_runs/review.html.erb +155 -0
  133. data/app/views/observ/dataset_runs/show.html.erb +166 -0
  134. data/app/views/observ/datasets/_form.html.erb +62 -0
  135. data/app/views/observ/datasets/_items_tab.html.erb +66 -0
  136. data/app/views/observ/datasets/_runs_tab.html.erb +82 -0
  137. data/app/views/observ/datasets/edit.html.erb +32 -0
  138. data/app/views/observ/datasets/index.html.erb +105 -0
  139. data/app/views/observ/datasets/new.html.erb +18 -0
  140. data/app/views/observ/datasets/show.html.erb +67 -0
  141. data/app/views/observ/messages/_content.html.erb +1 -0
  142. data/app/views/observ/messages/_form.html.erb +33 -0
  143. data/app/views/observ/messages/_message.html.erb +14 -0
  144. data/app/views/observ/messages/_tool_calls.html.erb +10 -0
  145. data/app/views/observ/messages/create.turbo_stream.erb +9 -0
  146. data/app/views/observ/observations/index.html.erb +97 -0
  147. data/app/views/observ/observations/show_generation.html.erb +195 -0
  148. data/app/views/observ/observations/show_span.html.erb +93 -0
  149. data/app/views/observ/prompts/_diff_content.html.erb +16 -0
  150. data/app/views/observ/prompts/_form.html.erb +111 -0
  151. data/app/views/observ/prompts/_new_form.html.erb +102 -0
  152. data/app/views/observ/prompts/_prompt_actions.html.erb +4 -0
  153. data/app/views/observ/prompts/_prompt_content_highlighted.html.erb +4 -0
  154. data/app/views/observ/prompts/_version_actions.html.erb +40 -0
  155. data/app/views/observ/prompts/compare.html.erb +155 -0
  156. data/app/views/observ/prompts/edit.html.erb +17 -0
  157. data/app/views/observ/prompts/index.html.erb +108 -0
  158. data/app/views/observ/prompts/new.html.erb +17 -0
  159. data/app/views/observ/prompts/show.html.erb +138 -0
  160. data/app/views/observ/prompts/versions.html.erb +87 -0
  161. data/app/views/observ/sessions/annotations_drawer.turbo_stream.erb +25 -0
  162. data/app/views/observ/sessions/drawer_test.turbo_stream.erb +49 -0
  163. data/app/views/observ/sessions/index.html.erb +91 -0
  164. data/app/views/observ/sessions/show.html.erb +251 -0
  165. data/app/views/observ/traces/add_to_dataset_drawer.turbo_stream.erb +48 -0
  166. data/app/views/observ/traces/annotations_drawer.turbo_stream.erb +25 -0
  167. data/app/views/observ/traces/index.html.erb +87 -0
  168. data/app/views/observ/traces/show.html.erb +285 -0
  169. data/app/views/observ/traces/text_output_drawer.turbo_stream.erb +48 -0
  170. data/app/views/shared/_drawer.html.erb +26 -0
  171. data/config/routes.rb +80 -0
  172. data/db/migrate/001_create_observ_sessions.rb +21 -0
  173. data/db/migrate/002_create_observ_traces.rb +25 -0
  174. data/db/migrate/003_create_observ_observations.rb +42 -0
  175. data/db/migrate/004_add_message_id_to_observ_traces.rb +7 -0
  176. data/db/migrate/005_create_observ_prompts.rb +21 -0
  177. data/db/migrate/006_fix_prompt_config_strings.rb +23 -0
  178. data/db/migrate/007_create_observ_annotations.rb +12 -0
  179. data/db/migrate/009_add_prompt_fields_to_observ_chats.rb +11 -0
  180. data/db/migrate/010_create_observ_datasets.rb +15 -0
  181. data/db/migrate/011_create_observ_dataset_items.rb +17 -0
  182. data/db/migrate/012_create_observ_dataset_runs.rb +22 -0
  183. data/db/migrate/013_create_observ_dataset_run_items.rb +16 -0
  184. data/db/migrate/014_create_observ_scores.rb +26 -0
  185. data/lib/generators/observ/add_phase_tracking/add_phase_tracking_generator.rb +150 -0
  186. data/lib/generators/observ/add_phase_tracking/templates/migration.rb.tt +6 -0
  187. data/lib/generators/observ/install/USAGE +27 -0
  188. data/lib/generators/observ/install/install_generator.rb +270 -0
  189. data/lib/generators/observ/install_chat/install_chat_generator.rb +313 -0
  190. data/lib/generators/observ/install_chat/templates/agents/base_agent.rb.tt +147 -0
  191. data/lib/generators/observ/install_chat/templates/agents/simple_agent.rb.tt +55 -0
  192. data/lib/generators/observ/install_chat/templates/concerns/observ_chat_enhancements.rb.tt +34 -0
  193. data/lib/generators/observ/install_chat/templates/concerns/observ_message_enhancements.rb.tt +18 -0
  194. data/lib/generators/observ/install_chat/templates/initializers/observability.rb.tt +20 -0
  195. data/lib/generators/observ/install_chat/templates/jobs/chat_response_job.rb.tt +56 -0
  196. data/lib/generators/observ/install_chat/templates/migrations/add_agent_class_name.rb.tt +6 -0
  197. data/lib/generators/observ/install_chat/templates/migrations/add_observability_session_id.rb.tt +6 -0
  198. data/lib/generators/observ/install_chat/templates/tools/think_tool.rb.tt +29 -0
  199. data/lib/generators/observ/install_chat/templates/views/messages/_content.html.erb.tt +1 -0
  200. data/lib/observ/asset_installer.rb +130 -0
  201. data/lib/observ/asset_syncer.rb +104 -0
  202. data/lib/observ/configuration.rb +108 -0
  203. data/lib/observ/engine.rb +50 -0
  204. data/lib/observ/index_file_generator.rb +142 -0
  205. data/lib/observ/instrumenter/ruby_llm.rb +6 -0
  206. data/lib/observ/version.rb +3 -0
  207. data/lib/observ.rb +29 -0
  208. data/lib/tasks/observ_tasks.rake +75 -0
  209. metadata +453 -0
@@ -0,0 +1,108 @@
1
+ <% content_for :title, "Runs - #{@dataset.name}" %>
2
+
3
+ <% content_for :page_header do %>
4
+ <div class="observ-page-header__content">
5
+ <div>
6
+ <%= link_to "← Back to Dataset", dataset_path(@dataset), class: "observ-datasets__back-link" %>
7
+ <h1 class="observ-page-header__title">Runs: <%= @dataset.name %></h1>
8
+ </div>
9
+ <%= link_to "New Run", new_dataset_run_path(@dataset), class: "observ-button observ-button--primary" %>
10
+ </div>
11
+ <% end %>
12
+
13
+ <div class="observ-container">
14
+ <!-- Filters -->
15
+ <section class="observ-card">
16
+ <div class="observ-card__body">
17
+ <%= form_with url: dataset_runs_path(@dataset), method: :get, class: "observ-datasets-filters__form" do |f| %>
18
+ <div class="observ-datasets-filters__field">
19
+ <%= f.label :status, "Filter by status", class: "observ-datasets-filters__label" %>
20
+ <%= f.select :status,
21
+ options_for_select([["All", ""], ["Pending", "pending"], ["Running", "running"], ["Completed", "completed"], ["Failed", "failed"]], params[:status]),
22
+ {},
23
+ class: "observ-datasets-filters__select" %>
24
+ </div>
25
+ <div class="observ-datasets-filters__actions">
26
+ <%= f.submit "Filter", class: "observ-button observ-button--secondary" %>
27
+ <%= link_to "Clear", dataset_runs_path(@dataset), class: "observ-button" %>
28
+ </div>
29
+ <% end %>
30
+ </div>
31
+ </section>
32
+
33
+ <!-- Runs Table -->
34
+ <section class="observ-card">
35
+ <div class="observ-card__body">
36
+ <% if @runs.any? %>
37
+ <table class="observ-table">
38
+ <thead class="observ-table__header">
39
+ <tr class="observ-table__row">
40
+ <th class="observ-table__cell">Name</th>
41
+ <th class="observ-table__cell">Status</th>
42
+ <th class="observ-table__cell">Progress</th>
43
+ <th class="observ-table__cell observ-table__cell--numeric">Cost</th>
44
+ <th class="observ-table__cell observ-table__cell--numeric">Tokens</th>
45
+ <th class="observ-table__cell">Created</th>
46
+ <th class="observ-table__cell observ-table__cell--actions"></th>
47
+ </tr>
48
+ </thead>
49
+ <tbody>
50
+ <% @runs.each do |run| %>
51
+ <tr class="observ-table__row">
52
+ <td class="observ-table__cell">
53
+ <%= link_to run.name, dataset_run_path(@dataset, run), class: "observ-datasets-table__link" %>
54
+ <% if run.description.present? %>
55
+ <p class="observ-datasets-table__description"><%= truncate(run.description, length: 50) %></p>
56
+ <% end %>
57
+ </td>
58
+ <td class="observ-table__cell">
59
+ <span class="observ-badge <%= run_status_badge_class(run.status) %>">
60
+ <%= run.status %>
61
+ </span>
62
+ </td>
63
+ <td class="observ-table__cell">
64
+ <div class="observ-datasets__progress">
65
+ <div class="observ-datasets__progress-bar">
66
+ <div class="observ-datasets__progress-fill observ-datasets__progress-fill--success"
67
+ style="width: <%= run.success_rate %>%"></div>
68
+ <div class="observ-datasets__progress-fill observ-datasets__progress-fill--danger"
69
+ style="width: <%= run.failure_rate %>%"></div>
70
+ </div>
71
+ <span class="observ-datasets__progress-text">
72
+ <%= run.completed_items %>/<%= run.total_items %>
73
+ </span>
74
+ </div>
75
+ </td>
76
+ <td class="observ-table__cell observ-table__cell--numeric">
77
+ $<%= number_with_precision(run.total_cost, precision: 4) %>
78
+ </td>
79
+ <td class="observ-table__cell observ-table__cell--numeric">
80
+ <%= number_with_delimiter(run.total_tokens) %>
81
+ </td>
82
+ <td class="observ-table__cell">
83
+ <%= time_ago_in_words(run.created_at) %> ago
84
+ </td>
85
+ <td class="observ-table__cell observ-table__cell--actions">
86
+ <div class="observ-datasets-table__action-group">
87
+ <%= link_to "View", dataset_run_path(@dataset, run), class: "observ-button observ-button--sm" %>
88
+ <%= button_to "Delete", dataset_run_path(@dataset, run),
89
+ method: :delete, # destructive action: must prompt before submitting
90
+ class: "observ-button observ-button--sm observ-button--danger", # data-confirm is read by rails-ujs only; Turbo reads data-turbo-confirm on the form
91
+ data: { confirm: "Are you sure?" }, form: { data: { turbo_confirm: "Are you sure?" } } %>
92
+ </div>
93
+ </td>
94
+ </tr>
95
+ <% end %>
96
+ </tbody>
97
+ </table>
98
+ <% else %>
99
+ <div class="observ-card__empty">
100
+ <p class="observ-card__empty-text">No runs found</p>
101
+ <%= link_to "Start a new run", new_dataset_run_path(@dataset), class: "observ-button observ-button--primary" %>
102
+ </div>
103
+ <% end %>
104
+ </div>
105
+ </section>
106
+
107
+ <%= observ_pagination(@runs) %>
108
+ </div>
@@ -0,0 +1,57 @@
1
+ <% content_for :title, "New Run - #{@dataset.name}" %>
2
+
3
+ <% content_for :page_header do %>
4
+ <div class="observ-page-header__content">
5
+ <div>
6
+ <%= link_to "← Back to Dataset", dataset_path(@dataset, tab: "runs"), class: "observ-datasets__back-link" %>
7
+ <h1 class="observ-page-header__title">New Run: <%= @dataset.name %></h1>
8
+ </div>
9
+ </div>
10
+ <% end %>
11
+
12
+ <div class="observ-container">
13
+ <!-- Run Info -->
14
+ <section class="observ-card">
15
+ <div class="observ-card__body">
16
+ <div class="observ-alert observ-alert--info">
17
+ <p>This run will evaluate <strong><%= @dataset.active_items_count %> active items</strong> using the <strong><%= @dataset.agent_class %></strong> agent.</p>
18
+ </div>
19
+
20
+ <%= form_with model: [@dataset, @run], url: dataset_runs_path(@dataset), class: "observ-form" do |f| %>
21
+ <% if @run.errors.any? %>
22
+ <div class="observ-alert observ-alert--danger">
23
+ <h3 class="observ-alert__title">Please fix the following errors:</h3>
24
+ <ul class="observ-alert__list">
25
+ <% @run.errors.full_messages.each do |message| %>
26
+ <li><%= message %></li>
27
+ <% end %>
28
+ </ul>
29
+ </div>
30
+ <% end %>
31
+
32
+ <div class="observ-form__group">
33
+ <%= f.label :name, class: "observ-form__label" %>
34
+ <%= f.text_field :name,
35
+ class: "observ-form__input",
36
+ placeholder: "e.g., v1.0-gpt4, baseline, experiment-1",
37
+ required: true %>
38
+ <p class="observ-form__hint">A unique name for this run (used for comparison)</p>
39
+ </div>
40
+
41
+ <div class="observ-form__group">
42
+ <%= f.label :description, class: "observ-form__label" %>
43
+ <%= f.text_area :description,
44
+ class: "observ-form__textarea",
45
+ rows: 3,
46
+ placeholder: "Describe the purpose of this run..." %>
47
+ <p class="observ-form__hint">Optional description of what you're testing</p>
48
+ </div>
49
+
50
+ <div class="observ-form__actions">
51
+ <%= f.submit "Start Run", class: "observ-button observ-button--primary" %>
52
+ <%= link_to "Cancel", dataset_path(@dataset, tab: "runs"), class: "observ-button" %>
53
+ </div>
54
+ <% end %>
55
+ </div>
56
+ </section>
57
+ </div>
@@ -0,0 +1,155 @@
1
+ <% content_for :title, "Review - #{@run.name}" %>
2
+
3
+ <% content_for :page_header do %>
4
+ <div class="observ-page-header__content">
5
+ <div class="observ-page-header__breadcrumb">
6
+ <%= link_to "Datasets", datasets_path, class: "observ-link" %> /
7
+ <%= link_to @dataset.name, dataset_path(@dataset, tab: "runs"), class: "observ-link" %> /
8
+ <%= link_to @run.name, dataset_run_path(@dataset, @run), class: "observ-link" %> /
9
+ </div>
10
+ <h1 class="observ-page-header__title">Review Mode</h1>
11
+ </div>
12
+ <div class="observ-page-header__actions">
13
+ <%= link_to "Exit Review", dataset_run_path(@dataset, @run), class: "observ-button" %>
14
+ </div>
15
+ <% end %>
16
+
17
+ <div class="observ-container">
18
+ <!-- Progress Bar -->
19
+ <section class="observ-review__progress-section">
20
+ <div class="observ-review__progress-header">
21
+ <span class="observ-review__progress-label">
22
+ Scoring Progress: <strong><%= @progress[:scored] %></strong> of <strong><%= @progress[:total] %></strong> items
23
+ </span>
24
+ <span class="observ-review__progress-percent">
25
+ <%= @progress[:total] > 0 ? ((@progress[:scored].to_f / @progress[:total]) * 100).round(0) : 0 %>%
26
+ </span>
27
+ </div>
28
+ <div class="observ-datasets__progress">
29
+ <div class="observ-datasets__progress-bar">
30
+ <div class="observ-datasets__progress-fill observ-datasets__progress-fill--success"
31
+ style="width: <%= @progress[:total] > 0 ? ((@progress[:scored].to_f / @progress[:total]) * 100) : 0 %>%"></div>
32
+ </div>
33
+ </div>
34
+ </section>
35
+
36
+ <!-- Review Card -->
37
+ <section class="observ-card observ-review__card">
38
+ <div class="observ-card__header">
39
+ <h2 class="observ-card__title">Item #<%= @run_item.id %></h2>
40
+ <% if @run_item.trace %>
41
+ <%= link_to "View Trace", trace_path(@run_item.trace), class: "observ-button observ-button--sm", target: "_blank" %>
42
+ <% end %>
43
+ </div>
44
+
45
+ <div class="observ-card__body">
46
+ <!-- Input / Expected / Actual Grid -->
47
+ <div class="observ-review__data-grid">
48
+ <div class="observ-review__data-section">
49
+ <h3 class="observ-review__data-label">Input</h3>
50
+ <pre class="observ-code-block observ-review__code-block"><%= format_trace_data(@run_item.input) %></pre>
51
+ </div>
52
+
53
+ <% if @run_item.expected_output.present? %>
54
+ <div class="observ-review__data-section">
55
+ <h3 class="observ-review__data-label">Expected Output</h3>
56
+ <pre class="observ-code-block observ-review__code-block"><%= format_trace_data(@run_item.expected_output) %></pre>
57
+ </div>
58
+ <% end %>
59
+
60
+ <div class="observ-review__data-section observ-review__data-section--highlight">
61
+ <h3 class="observ-review__data-label">Actual Output</h3>
62
+ <pre class="observ-code-block observ-review__code-block"><%= format_trace_data(@run_item.actual_output) %></pre>
63
+ </div>
64
+ </div>
65
+
66
+ <!-- Existing Scores (if any) -->
67
+ <% if @run_item.scores.any? %>
68
+ <div class="observ-review__existing-scores">
69
+ <h4 class="observ-text--label">Existing Scores</h4>
70
+ <ul class="observ-scores-list">
71
+ <% @run_item.scores.each do |score| %>
72
+ <li class="observ-scores-list__item">
73
+ <span class="observ-scores-list__indicator <%= score.passed? ? 'observ-scores-list__indicator--pass' : 'observ-scores-list__indicator--fail' %>">
74
+ <%= score.passed? ? '✓' : '✗' %>
75
+ </span>
76
+ <span class="observ-scores-list__name"><%= score.name %></span>
77
+ <% unless score.boolean? %>
78
+ <span class="observ-scores-list__value"><%= score.display_value %></span>
79
+ <% end %>
80
+ <span class="observ-scores-list__source"><%= score.source %></span>
81
+ </li>
82
+ <% end %>
83
+ </ul>
84
+ </div>
85
+ <% end %>
86
+
87
+ <!-- Scoring Form -->
88
+ <%= form_with url: score_dataset_run_run_item_path(@dataset, @run, @run_item),
89
+ method: :post,
90
+ class: "observ-review__form",
91
+ data: { turbo: false } do |f| %>
92
+ <%= hidden_field_tag :review_mode, "1" %>
93
+
94
+ <div class="observ-review__scoring-section">
95
+ <h3 class="observ-review__scoring-label">Is the output correct?</h3>
96
+
97
+ <div class="observ-review__score-buttons">
98
+ <label class="observ-review__score-button observ-review__score-button--pass">
99
+ <input type="radio" name="value" value="1" <%= "checked" if @existing_manual&.passed? %>>
100
+ <span class="observ-review__score-icon">&#10003;</span>
101
+ <span class="observ-review__score-text">Correct</span>
102
+ <span class="observ-review__score-shortcut">Press C</span>
103
+ </label>
104
+
105
+ <label class="observ-review__score-button observ-review__score-button--fail">
106
+ <input type="radio" name="value" value="0" <%= "checked" if @existing_manual&.failed? %>>
107
+ <span class="observ-review__score-icon">&#10005;</span>
108
+ <span class="observ-review__score-text">Incorrect</span>
109
+ <span class="observ-review__score-shortcut">Press X</span>
110
+ </label>
111
+ </div>
112
+
113
+ <div class="observ-form__group observ-review__comment-group">
114
+ <label class="observ-form__label" for="comment">Comment (optional)</label>
115
+ <textarea name="comment" id="comment" class="observ-form__textarea" rows="2" placeholder="Add notes about this score..."><%= @existing_manual&.comment %></textarea>
116
+ </div>
117
+ </div>
118
+
119
+ <div class="observ-review__actions">
120
+ <button type="submit" class="observ-button observ-button--primary observ-button--lg">
121
+ Save & Next
122
+ <span class="observ-review__action-shortcut">Enter</span>
123
+ </button>
124
+
125
+ <%= link_to dataset_run_path(@dataset, @run),
126
+ class: "observ-button observ-button--lg" do %>
127
+ Skip & Exit
128
+ <% end %>
129
+ </div>
130
+ <% end %>
131
+ </div>
132
+ </section>
133
+ </div>
134
+
135
+ <script>
136
+ // Keyboard shortcuts for faster reviewing
137
+ document.addEventListener('keydown', function(event) {
138
+ // Ignore modifier chords (Ctrl/Cmd+C copy, Ctrl/Cmd+X cut, Alt shortcuts) and keys aimed at fields, buttons or links, which handle Enter/typing natively
139
+ if (event.ctrlKey || event.metaKey || event.altKey || ['TEXTAREA', 'INPUT', 'BUTTON', 'A'].includes(event.target.tagName)) {
140
+ return;
141
+ }
142
+ // C selects the "Correct" radio, X selects "Incorrect", Enter submits only once one of them is checked
143
+ if (event.key === 'c' || event.key === 'C') {
144
+ document.querySelector('input[name="value"][value="1"]').checked = true;
145
+ } else if (event.key === 'x' || event.key === 'X') {
146
+ document.querySelector('input[name="value"][value="0"]').checked = true;
147
+ } else if (event.key === 'Enter') {
148
+ const form = document.querySelector('.observ-review__form');
149
+ const selected = document.querySelector('input[name="value"]:checked');
150
+ if (selected) { // never submit an unscored item from the keyboard
151
+ form.submit();
152
+ }
153
+ }
154
+ });
155
+ </script>
@@ -0,0 +1,166 @@
1
+ <% content_for :title, "#{@run.name} - #{@dataset.name}" %>
2
+
3
+ <% content_for :page_header do %>
4
+ <div class="observ-page-header__content">
5
+ <div class="observ-page-header__breadcrumb">
6
+ <%= link_to "Datasets", datasets_path, class: "observ-link" %> /
7
+ <%= link_to @dataset.name, dataset_path(@dataset, tab: "runs"), class: "observ-link" %> /
8
+ </div>
9
+ <h1 class="observ-page-header__title"><%= @run.name %></h1>
10
+ <% if @run.description.present? %>
11
+ <p class="observ-page-header__subtitle"><%= @run.description %></p>
12
+ <% end %>
13
+ </div>
14
+ <div class="observ-page-header__actions">
15
+ <%= link_to "Review Items", review_dataset_run_path(@dataset, @run),
16
+ class: "observ-button observ-button--primary" %>
17
+ <%= button_to "Run Evaluators", run_evaluators_dataset_run_path(@dataset, @run),
18
+ method: :post,
19
+ class: "observ-button observ-button--secondary",
20
+ disabled: @run.in_progress? %>
21
+ <%= button_to "Delete Run", dataset_run_path(@dataset, @run),
22
+ method: :delete, # destructive action: must prompt before submitting
23
+ class: "observ-button observ-button--danger", # data-confirm is read by rails-ujs only; Turbo reads data-turbo-confirm on the form
24
+ data: { confirm: "Are you sure you want to delete this run?" }, form: { data: { turbo_confirm: "Are you sure you want to delete this run?" } } %>
25
+ </div>
26
+ <% end %>
27
+
28
+ <div class="observ-container">
29
+ <!-- Run Summary -->
30
+ <section class="observ-card">
31
+ <div class="observ-card__body">
32
+ <!-- Progress Section -->
33
+ <div class="observ-datasets__run-progress">
34
+ <div class="observ-datasets__run-progress-header">
35
+ <span class="observ-badge <%= run_status_badge_class(@run.status) %>">
36
+ <%= @run.status %>
37
+ </span>
38
+ <span class="observ-datasets__run-progress-stats">
39
+ <%= @run.completed_items %> / <%= @run.total_items %> completed
40
+ <% if @run.failed_items > 0 %>
41
+ <span class="observ-text--danger">(<%= @run.failed_items %> failed)</span>
42
+ <% end %>
43
+ </span>
44
+ </div>
45
+ <div class="observ-datasets__progress">
46
+ <div class="observ-datasets__progress-bar">
47
+ <div class="observ-datasets__progress-fill observ-datasets__progress-fill--success"
48
+ style="width: <%= @run.success_rate %>%"></div>
49
+ <div class="observ-datasets__progress-fill observ-datasets__progress-fill--danger"
50
+ style="width: <%= @run.failure_rate %>%"></div>
51
+ </div>
52
+ <span class="observ-datasets__progress-text">
53
+ <%= @run.progress_percentage %>%
54
+ </span>
55
+ </div>
56
+ </div>
57
+
58
+ <!-- Metadata Grid -->
59
+ <dl class="observ-datasets__metadata observ-datasets__metadata--horizontal">
60
+ <div class="observ-datasets__metadata-item">
61
+ <dt class="observ-datasets__metadata-label">Total Cost</dt>
62
+ <dd class="observ-datasets__metadata-value">$<%= number_with_precision(@run.total_cost, precision: 4) %></dd>
63
+ </div>
64
+ <div class="observ-datasets__metadata-item">
65
+ <dt class="observ-datasets__metadata-label">Total Tokens</dt>
66
+ <dd class="observ-datasets__metadata-value"><%= number_with_delimiter(@run.total_tokens) %></dd>
67
+ </div>
68
+ <div class="observ-datasets__metadata-item">
69
+ <dt class="observ-datasets__metadata-label">Created</dt>
70
+ <dd class="observ-datasets__metadata-value"><%= @run.created_at.strftime("%b %d, %Y at %I:%M %p") %></dd>
71
+ </div>
72
+ <% if @run.scores.any? %>
73
+ <div class="observ-datasets__metadata-item">
74
+ <dt class="observ-datasets__metadata-label">Score Summary</dt>
75
+ <dd class="observ-datasets__metadata-value">
76
+ <% @run.score_summary.each do |name, avg| %>
77
+ <span class="observ-badge observ-badge--sm"><%= name %>: <%= (avg * 100).round(1) %>%</span>
78
+ <% end %>
79
+ </dd>
80
+ </div>
81
+ <div class="observ-datasets__metadata-item">
82
+ <dt class="observ-datasets__metadata-label">Items Scored</dt>
83
+ <dd class="observ-datasets__metadata-value"><%= @run.items_with_scores_count %> / <%= @run.total_items %></dd>
84
+ </div>
85
+ <% end %>
86
+ </dl>
87
+ </div>
88
+ </section>
89
+
90
+ <!-- Run Items -->
91
+ <section class="observ-card">
92
+ <div class="observ-card__header">
93
+ <h2 class="observ-card__title">Run Items</h2>
94
+ </div>
95
+ <div class="observ-card__body">
96
+ <% if @run_items.any? %>
97
+ <table class="observ-table">
98
+ <thead class="observ-table__header">
99
+ <tr class="observ-table__row">
100
+ <th class="observ-table__cell">Input</th>
101
+ <th class="observ-table__cell">Expected</th>
102
+ <th class="observ-table__cell">Actual</th>
103
+ <th class="observ-table__cell">Scores</th>
104
+ <th class="observ-table__cell observ-table__cell--actions"></th>
105
+ </tr>
106
+ </thead>
107
+ <tbody>
108
+ <% @run_items.each do |run_item| %>
109
+ <tr class="observ-table__row">
110
+ <td class="observ-table__cell observ-datasets__cell--preview">
111
+ <code class="observ-datasets__preview"><%= run_item.dataset_item.input_preview(max_length: 60) %></code>
112
+ </td>
113
+ <td class="observ-table__cell observ-datasets__cell--preview">
114
+ <% if run_item.expected_output.present? %>
115
+ <code class="observ-datasets__preview"><%= run_item.dataset_item.expected_output_preview(max_length: 60) %></code>
116
+ <% else %>
117
+ <span class="observ-text--muted">-</span>
118
+ <% end %>
119
+ </td>
120
+ <td class="observ-table__cell observ-datasets__cell--preview">
121
+ <% if run_item.actual_output.present? %>
122
+ <code class="observ-datasets__preview"><%= truncate(run_item.actual_output.to_s, length: 60) %></code>
123
+ <% elsif run_item.failed? %>
124
+ <span class="observ-text--danger" title="<%= run_item.error %>">Error: <%= truncate(run_item.error, length: 40) %></span>
125
+ <% else %>
126
+ <span class="observ-text--muted">Pending</span>
127
+ <% end %>
128
+ </td>
129
+ <%= render "observ/dataset_run_items/scores_cell", run_item: run_item %>
130
+ <td class="observ-table__cell observ-table__cell--actions">
131
+ <div class="observ-datasets-table__action-group">
132
+ <% if run_item.succeeded? %>
133
+ <%= link_to "Score",
134
+ "#",
135
+ class: "observ-button observ-button--sm",
136
+ data: {
137
+ action: "click->observ--drawer#open",
138
+ drawer_url_param: score_drawer_dataset_run_run_item_path(@dataset, @run, run_item)
139
+ } %>
140
+ <% end %>
141
+ <%= link_to "Details",
142
+ "#",
143
+ class: "observ-button observ-button--sm",
144
+ data: {
145
+ action: "click->observ--drawer#open",
146
+ drawer_url_param: details_drawer_dataset_run_run_item_path(@dataset, @run, run_item)
147
+ } %>
148
+ <% if run_item.trace %>
149
+ <%= link_to "Trace", trace_path(run_item.trace), class: "observ-button observ-button--sm" %>
150
+ <% end %>
151
+ </div>
152
+ </td>
153
+ </tr>
154
+ <% end %>
155
+ </tbody>
156
+ </table>
157
+
158
+ <%= observ_pagination(@run_items) %>
159
+ <% else %>
160
+ <div class="observ-card__empty">
161
+ <p class="observ-card__empty-text">No items in this run</p>
162
+ </div>
163
+ <% end %>
164
+ </div>
165
+ </section>
166
+ </div>
@@ -0,0 +1,62 @@
1
+ <%# Create/update form for a dataset. Reads "dataset" and "agents" as plain names (presumably locals passed by the rendering view; TODO confirm). Sends PATCH to dataset_path when the record is persisted and POST to datasets_path otherwise; fields are scoped under observ_dataset. %><%= form_with model: dataset,
2
+ scope: :observ_dataset,
3
+ url: dataset.persisted? ? dataset_path(dataset) : datasets_path,
4
+ method: dataset.persisted? ? :patch : :post,
5
+ class: "observ-form" do |f| %>
6
+ <%# Validation summary: rendered only when dataset.errors has entries; lists every full error message. %>
7
+ <% if dataset.errors.any? %>
8
+ <div class="observ-alert observ-alert--danger">
9
+ <h3 class="observ-alert__title">Please fix the following errors:</h3>
10
+ <ul class="observ-alert__list">
11
+ <% dataset.errors.full_messages.each do |message| %>
12
+ <li><%= message %></li>
13
+ <% end %>
14
+ </ul>
15
+ </div>
16
+ <% end %>
17
+ <%# Name: text field marked required in the browser. %>
18
+ <div class="observ-form__group">
19
+ <%= f.label :name, class: "observ-form__label" %>
20
+ <%= f.text_field :name,
21
+ class: "observ-form__input",
22
+ placeholder: "e.g., language-detection-tests",
23
+ required: true %>
24
+ <p class="observ-form__hint">A unique name for this dataset</p>
25
+ </div>
26
+ <%# Description: three-row text area; no "required" attribute is set on it. %>
27
+ <div class="observ-form__group">
28
+ <%= f.label :description, class: "observ-form__label" %>
29
+ <%= f.text_area :description,
30
+ class: "observ-form__textarea",
31
+ rows: 3,
32
+ placeholder: "Describe what this dataset tests..." %>
33
+ <p class="observ-form__hint">Optional description of this dataset's purpose</p>
34
+ </div>
35
+ <%# Agent: a required select built from "agents" (pre-selecting dataset.agent_class) when that list is non-empty; otherwise a required free-text field plus a warning hint. %>
36
+ <div class="observ-form__group">
37
+ <%= f.label :agent_class, "Agent", class: "observ-form__label" %>
38
+ <% if agents.any? %>
39
+ <%= f.select :agent_class,
40
+ options_for_select(agents, dataset.agent_class),
41
+ { prompt: "Select an agent..." },
42
+ { class: "observ-form__select", required: true } %>
43
+ <% else %>
44
+ <%= f.text_field :agent_class,
45
+ class: "observ-form__input",
46
+ placeholder: "e.g., MyAgent",
47
+ required: true %>
48
+ <p class="observ-form__hint observ-form__hint--warning">
49
+ No agents found. Enter the agent class name manually.
50
+ </p>
51
+ <% end %>
52
+ <p class="observ-form__hint">The agent that will be run against this dataset</p>
53
+ </div>
54
+ <%# Actions: the submit label and the Cancel target both depend on whether the dataset is already persisted. %>
55
+ <div class="observ-form__actions">
56
+ <%= f.submit dataset.persisted? ? "Update Dataset" : "Create Dataset",
57
+ class: "observ-button observ-button--primary" %>
58
+ <%= link_to "Cancel",
59
+ dataset.persisted? ? dataset_path(dataset) : datasets_path,
60
+ class: "observ-button" %>
61
+ </div>
62
+ <% end %>
@@ -0,0 +1,66 @@
1
+ <%# Card listing the items of @dataset: one table row per entry in @items (input and expected-output previews, status badge, run count, age, Edit/Delete actions). When @items is empty an empty state with an "Add your first item" link is shown instead. %><section class="observ-card">
2
+ <div class="observ-card__header">
3
+ <h2 class="observ-card__title">Dataset Items</h2>
4
+ <%= link_to "Add Item", new_dataset_item_path(@dataset), class: "observ-button observ-button--sm observ-button--primary" %>
5
+ </div>
6
+ <div class="observ-card__body">
7
+ <% if @items.any? %>
8
+ <table class="observ-table">
9
+ <thead class="observ-table__header">
10
+ <tr class="observ-table__row">
11
+ <th class="observ-table__cell">Input</th>
12
+ <th class="observ-table__cell">Expected Output</th>
13
+ <th class="observ-table__cell">Status</th>
14
+ <th class="observ-table__cell observ-table__cell--numeric">Runs</th>
15
+ <th class="observ-table__cell">Created</th>
16
+ <th class="observ-table__cell observ-table__cell--actions"></th>
17
+ </tr>
18
+ </thead>
19
+ <tbody>
20
+ <% @items.each do |item| %>
21
+ <tr class="observ-table__row">
22
+ <td class="observ-table__cell observ-datasets__cell--preview">
23
+ <code class="observ-datasets__preview"><%= item.input_preview(max_length: 80) %></code>
24
+ </td>
25
+ <td class="observ-table__cell observ-datasets__cell--preview">
26
+ <% if item.expected_output.present? %>
27
+ <code class="observ-datasets__preview"><%= item.expected_output_preview(max_length: 80) %></code>
28
+ <% else %>
29
+ <span class="observ-text--muted">Not set</span>
30
+ <% end %>
31
+ </td>
32
+ <td class="observ-table__cell">
33
+ <span class="observ-badge <%= item.active? ? 'observ-badge--success' : 'observ-badge--default' %>">
34
+ <%= item.status %>
35
+ </span>
36
+ </td>
37
+ <td class="observ-table__cell observ-table__cell--numeric">
38
+ <%= item.run_count %>
39
+ </td>
40
+ <td class="observ-table__cell">
41
+ <%= time_ago_in_words(item.created_at) %> ago
42
+ </td>
43
+ <td class="observ-table__cell observ-table__cell--actions">
44
+ <div class="observ-datasets-table__action-group">
45
+ <%= link_to "Edit", edit_dataset_item_path(@dataset, item), class: "observ-button observ-button--sm" %><%# Delete below asks for confirmation under both drivers: data-confirm on the button is read by rails-ujs, data-turbo-confirm on the generated form is read by Turbo. %>
46
+ <%= button_to "Delete", dataset_item_path(@dataset, item),
47
+ method: :delete,
48
+ class: "observ-button observ-button--sm observ-button--danger",
49
+ data: { confirm: "Are you sure you want to delete this item?" }, form: { data: { turbo_confirm: "Are you sure you want to delete this item?" } } %>
50
+ </div>
51
+ </td>
52
+ </tr>
53
+ <% end %>
54
+ </tbody>
55
+ </table>
56
+ <%# Pagination controls for @items, produced by the observ_pagination helper (defined outside this template). %>
57
+ <%= observ_pagination(@items) %>
58
+ <% else %>
59
+ <div class="observ-card__empty">
60
+ <p class="observ-card__empty-text">No items in this dataset</p>
61
+ <p class="observ-card__empty-subtext">Add test cases to start evaluating your agent</p>
62
+ <%= link_to "Add your first item", new_dataset_item_path(@dataset), class: "observ-button observ-button--primary" %>
63
+ </div>
64
+ <% end %>
65
+ </div>
66
+ </section>