rasa_pro-3.8.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of rasa-pro has been flagged as potentially problematic.

Files changed (644)
  1. README.md +380 -0
  2. rasa/__init__.py +10 -0
  3. rasa/__main__.py +151 -0
  4. rasa/anonymization/__init__.py +2 -0
  5. rasa/anonymization/anonymisation_rule_yaml_reader.py +91 -0
  6. rasa/anonymization/anonymization_pipeline.py +287 -0
  7. rasa/anonymization/anonymization_rule_executor.py +260 -0
  8. rasa/anonymization/anonymization_rule_orchestrator.py +120 -0
  9. rasa/anonymization/schemas/config.yml +47 -0
  10. rasa/anonymization/utils.py +117 -0
  11. rasa/api.py +146 -0
  12. rasa/cli/__init__.py +5 -0
  13. rasa/cli/arguments/__init__.py +0 -0
  14. rasa/cli/arguments/data.py +81 -0
  15. rasa/cli/arguments/default_arguments.py +165 -0
  16. rasa/cli/arguments/evaluate.py +65 -0
  17. rasa/cli/arguments/export.py +51 -0
  18. rasa/cli/arguments/interactive.py +74 -0
  19. rasa/cli/arguments/run.py +204 -0
  20. rasa/cli/arguments/shell.py +13 -0
  21. rasa/cli/arguments/test.py +211 -0
  22. rasa/cli/arguments/train.py +263 -0
  23. rasa/cli/arguments/visualize.py +34 -0
  24. rasa/cli/arguments/x.py +30 -0
  25. rasa/cli/data.py +292 -0
  26. rasa/cli/e2e_test.py +566 -0
  27. rasa/cli/evaluate.py +222 -0
  28. rasa/cli/export.py +251 -0
  29. rasa/cli/inspect.py +63 -0
  30. rasa/cli/interactive.py +164 -0
  31. rasa/cli/license.py +65 -0
  32. rasa/cli/markers.py +78 -0
  33. rasa/cli/project_templates/__init__.py +0 -0
  34. rasa/cli/project_templates/calm/actions/__init__.py +0 -0
  35. rasa/cli/project_templates/calm/actions/action_template.py +27 -0
  36. rasa/cli/project_templates/calm/actions/add_contact.py +30 -0
  37. rasa/cli/project_templates/calm/actions/db.py +57 -0
  38. rasa/cli/project_templates/calm/actions/list_contacts.py +22 -0
  39. rasa/cli/project_templates/calm/actions/remove_contact.py +35 -0
  40. rasa/cli/project_templates/calm/config.yml +12 -0
  41. rasa/cli/project_templates/calm/credentials.yml +33 -0
  42. rasa/cli/project_templates/calm/data/flows/add_contact.yml +31 -0
  43. rasa/cli/project_templates/calm/data/flows/list_contacts.yml +14 -0
  44. rasa/cli/project_templates/calm/data/flows/remove_contact.yml +29 -0
  45. rasa/cli/project_templates/calm/db/contacts.json +10 -0
  46. rasa/cli/project_templates/calm/domain/add_contact.yml +33 -0
  47. rasa/cli/project_templates/calm/domain/list_contacts.yml +14 -0
  48. rasa/cli/project_templates/calm/domain/remove_contact.yml +31 -0
  49. rasa/cli/project_templates/calm/domain/shared.yml +5 -0
  50. rasa/cli/project_templates/calm/e2e_tests/cancelations/user_cancels_during_a_correction.yml +16 -0
  51. rasa/cli/project_templates/calm/e2e_tests/cancelations/user_changes_mind_on_a_whim.yml +7 -0
  52. rasa/cli/project_templates/calm/e2e_tests/corrections/user_corrects_contact_handle.yml +20 -0
  53. rasa/cli/project_templates/calm/e2e_tests/corrections/user_corrects_contact_name.yml +19 -0
  54. rasa/cli/project_templates/calm/e2e_tests/happy_paths/user_adds_contact_to_their_list.yml +15 -0
  55. rasa/cli/project_templates/calm/e2e_tests/happy_paths/user_lists_contacts.yml +5 -0
  56. rasa/cli/project_templates/calm/e2e_tests/happy_paths/user_removes_contact.yml +11 -0
  57. rasa/cli/project_templates/calm/e2e_tests/happy_paths/user_removes_contact_from_list.yml +12 -0
  58. rasa/cli/project_templates/calm/endpoints.yml +45 -0
  59. rasa/cli/project_templates/default/actions/__init__.py +0 -0
  60. rasa/cli/project_templates/default/actions/actions.py +27 -0
  61. rasa/cli/project_templates/default/config.yml +44 -0
  62. rasa/cli/project_templates/default/credentials.yml +33 -0
  63. rasa/cli/project_templates/default/data/nlu.yml +91 -0
  64. rasa/cli/project_templates/default/data/rules.yml +13 -0
  65. rasa/cli/project_templates/default/data/stories.yml +30 -0
  66. rasa/cli/project_templates/default/domain.yml +34 -0
  67. rasa/cli/project_templates/default/endpoints.yml +42 -0
  68. rasa/cli/project_templates/default/tests/test_stories.yml +91 -0
  69. rasa/cli/project_templates/tutorial/actions.py +22 -0
  70. rasa/cli/project_templates/tutorial/config.yml +11 -0
  71. rasa/cli/project_templates/tutorial/credentials.yml +33 -0
  72. rasa/cli/project_templates/tutorial/data/flows.yml +8 -0
  73. rasa/cli/project_templates/tutorial/domain.yml +17 -0
  74. rasa/cli/project_templates/tutorial/endpoints.yml +45 -0
  75. rasa/cli/run.py +136 -0
  76. rasa/cli/scaffold.py +268 -0
  77. rasa/cli/shell.py +141 -0
  78. rasa/cli/studio/__init__.py +0 -0
  79. rasa/cli/studio/download.py +51 -0
  80. rasa/cli/studio/studio.py +110 -0
  81. rasa/cli/studio/train.py +59 -0
  82. rasa/cli/studio/upload.py +85 -0
  83. rasa/cli/telemetry.py +90 -0
  84. rasa/cli/test.py +280 -0
  85. rasa/cli/train.py +260 -0
  86. rasa/cli/utils.py +453 -0
  87. rasa/cli/visualize.py +40 -0
  88. rasa/cli/x.py +205 -0
  89. rasa/constants.py +37 -0
  90. rasa/core/__init__.py +17 -0
  91. rasa/core/actions/__init__.py +0 -0
  92. rasa/core/actions/action.py +1450 -0
  93. rasa/core/actions/action_clean_stack.py +59 -0
  94. rasa/core/actions/action_run_slot_rejections.py +207 -0
  95. rasa/core/actions/action_trigger_chitchat.py +31 -0
  96. rasa/core/actions/action_trigger_flow.py +109 -0
  97. rasa/core/actions/action_trigger_search.py +31 -0
  98. rasa/core/actions/constants.py +2 -0
  99. rasa/core/actions/forms.py +737 -0
  100. rasa/core/actions/loops.py +111 -0
  101. rasa/core/actions/two_stage_fallback.py +186 -0
  102. rasa/core/agent.py +557 -0
  103. rasa/core/auth_retry_tracker_store.py +122 -0
  104. rasa/core/brokers/__init__.py +0 -0
  105. rasa/core/brokers/broker.py +126 -0
  106. rasa/core/brokers/file.py +58 -0
  107. rasa/core/brokers/kafka.py +322 -0
  108. rasa/core/brokers/pika.py +387 -0
  109. rasa/core/brokers/sql.py +86 -0
  110. rasa/core/channels/__init__.py +55 -0
  111. rasa/core/channels/audiocodes.py +463 -0
  112. rasa/core/channels/botframework.py +339 -0
  113. rasa/core/channels/callback.py +85 -0
  114. rasa/core/channels/channel.py +419 -0
  115. rasa/core/channels/console.py +243 -0
  116. rasa/core/channels/development_inspector.py +93 -0
  117. rasa/core/channels/facebook.py +422 -0
  118. rasa/core/channels/hangouts.py +335 -0
  119. rasa/core/channels/inspector/.eslintrc.cjs +25 -0
  120. rasa/core/channels/inspector/.gitignore +23 -0
  121. rasa/core/channels/inspector/README.md +54 -0
  122. rasa/core/channels/inspector/assets/favicon.ico +0 -0
  123. rasa/core/channels/inspector/assets/rasa-chat.js +2 -0
  124. rasa/core/channels/inspector/custom.d.ts +3 -0
  125. rasa/core/channels/inspector/dist/assets/arc-5623b6dc.js +1 -0
  126. rasa/core/channels/inspector/dist/assets/array-9f3ba611.js +1 -0
  127. rasa/core/channels/inspector/dist/assets/c4Diagram-d0fbc5ce-685c106a.js +10 -0
  128. rasa/core/channels/inspector/dist/assets/classDiagram-936ed81e-8cbed007.js +2 -0
  129. rasa/core/channels/inspector/dist/assets/classDiagram-v2-c3cb15f1-5889cf12.js +2 -0
  130. rasa/core/channels/inspector/dist/assets/createText-62fc7601-24c249d7.js +7 -0
  131. rasa/core/channels/inspector/dist/assets/edges-f2ad444c-7dd06a75.js +4 -0
  132. rasa/core/channels/inspector/dist/assets/erDiagram-9d236eb7-62c1e54c.js +51 -0
  133. rasa/core/channels/inspector/dist/assets/flowDb-1972c806-ce49b86f.js +6 -0
  134. rasa/core/channels/inspector/dist/assets/flowDiagram-7ea5b25a-4067e48f.js +4 -0
  135. rasa/core/channels/inspector/dist/assets/flowDiagram-v2-855bc5b3-85583a23.js +1 -0
  136. rasa/core/channels/inspector/dist/assets/flowchart-elk-definition-abe16c3d-59fe4051.js +139 -0
  137. rasa/core/channels/inspector/dist/assets/ganttDiagram-9b5ea136-47e3a43b.js +266 -0
  138. rasa/core/channels/inspector/dist/assets/gitGraphDiagram-99d0ae7c-5a2ac0d9.js +70 -0
  139. rasa/core/channels/inspector/dist/assets/ibm-plex-mono-v4-latin-regular-128cfa44.ttf +0 -0
  140. rasa/core/channels/inspector/dist/assets/ibm-plex-mono-v4-latin-regular-21dbcb97.woff +0 -0
  141. rasa/core/channels/inspector/dist/assets/ibm-plex-mono-v4-latin-regular-222b5e26.svg +329 -0
  142. rasa/core/channels/inspector/dist/assets/ibm-plex-mono-v4-latin-regular-9ad89b2a.woff2 +0 -0
  143. rasa/core/channels/inspector/dist/assets/index-268a75c0.js +1040 -0
  144. rasa/core/channels/inspector/dist/assets/index-2c4b9a3b-dfb8efc4.js +1 -0
  145. rasa/core/channels/inspector/dist/assets/index-3ee28881.css +1 -0
  146. rasa/core/channels/inspector/dist/assets/infoDiagram-736b4530-b0c470f2.js +7 -0
  147. rasa/core/channels/inspector/dist/assets/init-77b53fdd.js +1 -0
  148. rasa/core/channels/inspector/dist/assets/journeyDiagram-df861f2b-2edb829a.js +139 -0
  149. rasa/core/channels/inspector/dist/assets/lato-v14-latin-700-60c05ee4.woff +0 -0
  150. rasa/core/channels/inspector/dist/assets/lato-v14-latin-700-8335d9b8.svg +438 -0
  151. rasa/core/channels/inspector/dist/assets/lato-v14-latin-700-9cc39c75.ttf +0 -0
  152. rasa/core/channels/inspector/dist/assets/lato-v14-latin-700-ead13ccf.woff2 +0 -0
  153. rasa/core/channels/inspector/dist/assets/lato-v14-latin-regular-16705655.woff2 +0 -0
  154. rasa/core/channels/inspector/dist/assets/lato-v14-latin-regular-5aeb07f9.woff +0 -0
  155. rasa/core/channels/inspector/dist/assets/lato-v14-latin-regular-9c459044.ttf +0 -0
  156. rasa/core/channels/inspector/dist/assets/lato-v14-latin-regular-9e2898a4.svg +435 -0
  157. rasa/core/channels/inspector/dist/assets/layout-b6873d69.js +1 -0
  158. rasa/core/channels/inspector/dist/assets/line-1efc5781.js +1 -0
  159. rasa/core/channels/inspector/dist/assets/linear-661e9b94.js +1 -0
  160. rasa/core/channels/inspector/dist/assets/mindmap-definition-beec6740-2d2e727f.js +109 -0
  161. rasa/core/channels/inspector/dist/assets/ordinal-ba9b4969.js +1 -0
  162. rasa/core/channels/inspector/dist/assets/path-53f90ab3.js +1 -0
  163. rasa/core/channels/inspector/dist/assets/pieDiagram-dbbf0591-9d3ea93d.js +35 -0
  164. rasa/core/channels/inspector/dist/assets/quadrantDiagram-4d7f4fd6-06a178a2.js +7 -0
  165. rasa/core/channels/inspector/dist/assets/requirementDiagram-6fc4c22a-0bfedffc.js +52 -0
  166. rasa/core/channels/inspector/dist/assets/sankeyDiagram-8f13d901-d76d0a04.js +8 -0
  167. rasa/core/channels/inspector/dist/assets/sequenceDiagram-b655622a-37bb4341.js +122 -0
  168. rasa/core/channels/inspector/dist/assets/stateDiagram-59f0c015-f52f7f57.js +1 -0
  169. rasa/core/channels/inspector/dist/assets/stateDiagram-v2-2b26beab-4a986a20.js +1 -0
  170. rasa/core/channels/inspector/dist/assets/styles-080da4f6-7dd9ae12.js +110 -0
  171. rasa/core/channels/inspector/dist/assets/styles-3dcbcfbf-46e1ca14.js +159 -0
  172. rasa/core/channels/inspector/dist/assets/styles-9c745c82-4a97439a.js +207 -0
  173. rasa/core/channels/inspector/dist/assets/svgDrawCommon-4835440b-823917a3.js +1 -0
  174. rasa/core/channels/inspector/dist/assets/timeline-definition-5b62e21b-9ea72896.js +61 -0
  175. rasa/core/channels/inspector/dist/assets/xychartDiagram-2b33534f-b631a8b6.js +7 -0
  176. rasa/core/channels/inspector/dist/index.html +39 -0
  177. rasa/core/channels/inspector/index.html +37 -0
  178. rasa/core/channels/inspector/jest.config.ts +13 -0
  179. rasa/core/channels/inspector/package.json +48 -0
  180. rasa/core/channels/inspector/setupTests.ts +2 -0
  181. rasa/core/channels/inspector/src/App.tsx +170 -0
  182. rasa/core/channels/inspector/src/components/DiagramFlow.tsx +97 -0
  183. rasa/core/channels/inspector/src/components/DialogueInformation.tsx +187 -0
  184. rasa/core/channels/inspector/src/components/DialogueStack.tsx +151 -0
  185. rasa/core/channels/inspector/src/components/ExpandIcon.tsx +16 -0
  186. rasa/core/channels/inspector/src/components/FullscreenButton.tsx +45 -0
  187. rasa/core/channels/inspector/src/components/LoadingSpinner.tsx +19 -0
  188. rasa/core/channels/inspector/src/components/NoActiveFlow.tsx +21 -0
  189. rasa/core/channels/inspector/src/components/RasaLogo.tsx +32 -0
  190. rasa/core/channels/inspector/src/components/SaraDiagrams.tsx +39 -0
  191. rasa/core/channels/inspector/src/components/Slots.tsx +91 -0
  192. rasa/core/channels/inspector/src/components/Welcome.tsx +54 -0
  193. rasa/core/channels/inspector/src/helpers/formatters.test.ts +385 -0
  194. rasa/core/channels/inspector/src/helpers/formatters.ts +239 -0
  195. rasa/core/channels/inspector/src/helpers/utils.ts +42 -0
  196. rasa/core/channels/inspector/src/main.tsx +13 -0
  197. rasa/core/channels/inspector/src/theme/Button/Button.ts +29 -0
  198. rasa/core/channels/inspector/src/theme/Heading/Heading.ts +31 -0
  199. rasa/core/channels/inspector/src/theme/Input/Input.ts +27 -0
  200. rasa/core/channels/inspector/src/theme/Link/Link.ts +10 -0
  201. rasa/core/channels/inspector/src/theme/Modal/Modal.ts +47 -0
  202. rasa/core/channels/inspector/src/theme/Table/Table.tsx +38 -0
  203. rasa/core/channels/inspector/src/theme/Tooltip/Tooltip.ts +12 -0
  204. rasa/core/channels/inspector/src/theme/base/breakpoints.ts +8 -0
  205. rasa/core/channels/inspector/src/theme/base/colors.ts +88 -0
  206. rasa/core/channels/inspector/src/theme/base/fonts/fontFaces.css +29 -0
  207. rasa/core/channels/inspector/src/theme/base/fonts/ibm-plex-mono-v4-latin/ibm-plex-mono-v4-latin-regular.eot +0 -0
  208. rasa/core/channels/inspector/src/theme/base/fonts/ibm-plex-mono-v4-latin/ibm-plex-mono-v4-latin-regular.svg +329 -0
  209. rasa/core/channels/inspector/src/theme/base/fonts/ibm-plex-mono-v4-latin/ibm-plex-mono-v4-latin-regular.ttf +0 -0
  210. rasa/core/channels/inspector/src/theme/base/fonts/ibm-plex-mono-v4-latin/ibm-plex-mono-v4-latin-regular.woff +0 -0
  211. rasa/core/channels/inspector/src/theme/base/fonts/ibm-plex-mono-v4-latin/ibm-plex-mono-v4-latin-regular.woff2 +0 -0
  212. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-700.eot +0 -0
  213. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-700.svg +438 -0
  214. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-700.ttf +0 -0
  215. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-700.woff +0 -0
  216. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-700.woff2 +0 -0
  217. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-regular.eot +0 -0
  218. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-regular.svg +435 -0
  219. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-regular.ttf +0 -0
  220. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-regular.woff +0 -0
  221. rasa/core/channels/inspector/src/theme/base/fonts/lato-v14-latin/lato-v14-latin-regular.woff2 +0 -0
  222. rasa/core/channels/inspector/src/theme/base/radii.ts +9 -0
  223. rasa/core/channels/inspector/src/theme/base/shadows.ts +7 -0
  224. rasa/core/channels/inspector/src/theme/base/sizes.ts +7 -0
  225. rasa/core/channels/inspector/src/theme/base/space.ts +15 -0
  226. rasa/core/channels/inspector/src/theme/base/styles.ts +13 -0
  227. rasa/core/channels/inspector/src/theme/base/typography.ts +24 -0
  228. rasa/core/channels/inspector/src/theme/base/zIndices.ts +19 -0
  229. rasa/core/channels/inspector/src/theme/index.ts +101 -0
  230. rasa/core/channels/inspector/src/types.ts +64 -0
  231. rasa/core/channels/inspector/src/vite-env.d.ts +1 -0
  232. rasa/core/channels/inspector/tests/__mocks__/fileMock.ts +1 -0
  233. rasa/core/channels/inspector/tests/__mocks__/matchMedia.ts +16 -0
  234. rasa/core/channels/inspector/tests/__mocks__/styleMock.ts +1 -0
  235. rasa/core/channels/inspector/tests/renderWithProviders.tsx +14 -0
  236. rasa/core/channels/inspector/tsconfig.json +26 -0
  237. rasa/core/channels/inspector/tsconfig.node.json +10 -0
  238. rasa/core/channels/inspector/vite.config.ts +8 -0
  239. rasa/core/channels/inspector/yarn.lock +6156 -0
  240. rasa/core/channels/mattermost.py +229 -0
  241. rasa/core/channels/rasa_chat.py +126 -0
  242. rasa/core/channels/rest.py +210 -0
  243. rasa/core/channels/rocketchat.py +175 -0
  244. rasa/core/channels/slack.py +620 -0
  245. rasa/core/channels/socketio.py +274 -0
  246. rasa/core/channels/telegram.py +298 -0
  247. rasa/core/channels/twilio.py +169 -0
  248. rasa/core/channels/twilio_voice.py +367 -0
  249. rasa/core/channels/vier_cvg.py +374 -0
  250. rasa/core/channels/webexteams.py +135 -0
  251. rasa/core/concurrent_lock_store.py +210 -0
  252. rasa/core/constants.py +107 -0
  253. rasa/core/evaluation/__init__.py +0 -0
  254. rasa/core/evaluation/marker.py +267 -0
  255. rasa/core/evaluation/marker_base.py +925 -0
  256. rasa/core/evaluation/marker_stats.py +294 -0
  257. rasa/core/evaluation/marker_tracker_loader.py +103 -0
  258. rasa/core/exceptions.py +29 -0
  259. rasa/core/exporter.py +284 -0
  260. rasa/core/featurizers/__init__.py +0 -0
  261. rasa/core/featurizers/precomputation.py +410 -0
  262. rasa/core/featurizers/single_state_featurizer.py +402 -0
  263. rasa/core/featurizers/tracker_featurizers.py +1172 -0
  264. rasa/core/http_interpreter.py +89 -0
  265. rasa/core/information_retrieval/__init__.py +0 -0
  266. rasa/core/information_retrieval/faiss.py +116 -0
  267. rasa/core/information_retrieval/information_retrieval.py +72 -0
  268. rasa/core/information_retrieval/milvus.py +59 -0
  269. rasa/core/information_retrieval/qdrant.py +102 -0
  270. rasa/core/jobs.py +63 -0
  271. rasa/core/lock.py +139 -0
  272. rasa/core/lock_store.py +344 -0
  273. rasa/core/migrate.py +404 -0
  274. rasa/core/nlg/__init__.py +3 -0
  275. rasa/core/nlg/callback.py +147 -0
  276. rasa/core/nlg/contextual_response_rephraser.py +270 -0
  277. rasa/core/nlg/generator.py +230 -0
  278. rasa/core/nlg/interpolator.py +143 -0
  279. rasa/core/nlg/response.py +155 -0
  280. rasa/core/nlg/summarize.py +69 -0
  281. rasa/core/policies/__init__.py +0 -0
  282. rasa/core/policies/ensemble.py +329 -0
  283. rasa/core/policies/enterprise_search_policy.py +717 -0
  284. rasa/core/policies/enterprise_search_prompt_template.jinja2 +62 -0
  285. rasa/core/policies/flow_policy.py +205 -0
  286. rasa/core/policies/flows/__init__.py +0 -0
  287. rasa/core/policies/flows/flow_exceptions.py +44 -0
  288. rasa/core/policies/flows/flow_executor.py +582 -0
  289. rasa/core/policies/flows/flow_step_result.py +43 -0
  290. rasa/core/policies/intentless_policy.py +924 -0
  291. rasa/core/policies/intentless_prompt_template.jinja2 +22 -0
  292. rasa/core/policies/memoization.py +538 -0
  293. rasa/core/policies/policy.py +716 -0
  294. rasa/core/policies/rule_policy.py +1276 -0
  295. rasa/core/policies/ted_policy.py +2146 -0
  296. rasa/core/policies/unexpected_intent_policy.py +1015 -0
  297. rasa/core/processor.py +1331 -0
  298. rasa/core/run.py +315 -0
  299. rasa/core/secrets_manager/__init__.py +0 -0
  300. rasa/core/secrets_manager/constants.py +32 -0
  301. rasa/core/secrets_manager/endpoints.py +391 -0
  302. rasa/core/secrets_manager/factory.py +233 -0
  303. rasa/core/secrets_manager/secret_manager.py +262 -0
  304. rasa/core/secrets_manager/vault.py +576 -0
  305. rasa/core/test.py +1337 -0
  306. rasa/core/tracker_store.py +1664 -0
  307. rasa/core/train.py +107 -0
  308. rasa/core/training/__init__.py +89 -0
  309. rasa/core/training/converters/__init__.py +0 -0
  310. rasa/core/training/converters/responses_prefix_converter.py +119 -0
  311. rasa/core/training/interactive.py +1742 -0
  312. rasa/core/training/story_conflict.py +381 -0
  313. rasa/core/training/training.py +93 -0
  314. rasa/core/utils.py +344 -0
  315. rasa/core/visualize.py +70 -0
  316. rasa/dialogue_understanding/__init__.py +0 -0
  317. rasa/dialogue_understanding/coexistence/__init__.py +0 -0
  318. rasa/dialogue_understanding/coexistence/constants.py +4 -0
  319. rasa/dialogue_understanding/coexistence/intent_based_router.py +189 -0
  320. rasa/dialogue_understanding/coexistence/llm_based_router.py +261 -0
  321. rasa/dialogue_understanding/coexistence/router_template.jinja2 +12 -0
  322. rasa/dialogue_understanding/commands/__init__.py +45 -0
  323. rasa/dialogue_understanding/commands/can_not_handle_command.py +61 -0
  324. rasa/dialogue_understanding/commands/cancel_flow_command.py +116 -0
  325. rasa/dialogue_understanding/commands/chit_chat_answer_command.py +48 -0
  326. rasa/dialogue_understanding/commands/clarify_command.py +77 -0
  327. rasa/dialogue_understanding/commands/command.py +85 -0
  328. rasa/dialogue_understanding/commands/correct_slots_command.py +288 -0
  329. rasa/dialogue_understanding/commands/error_command.py +67 -0
  330. rasa/dialogue_understanding/commands/free_form_answer_command.py +9 -0
  331. rasa/dialogue_understanding/commands/handle_code_change_command.py +64 -0
  332. rasa/dialogue_understanding/commands/human_handoff_command.py +57 -0
  333. rasa/dialogue_understanding/commands/knowledge_answer_command.py +48 -0
  334. rasa/dialogue_understanding/commands/noop_command.py +45 -0
  335. rasa/dialogue_understanding/commands/set_slot_command.py +125 -0
  336. rasa/dialogue_understanding/commands/skip_question_command.py +66 -0
  337. rasa/dialogue_understanding/commands/start_flow_command.py +98 -0
  338. rasa/dialogue_understanding/generator/__init__.py +6 -0
  339. rasa/dialogue_understanding/generator/command_generator.py +257 -0
  340. rasa/dialogue_understanding/generator/command_prompt_template.jinja2 +57 -0
  341. rasa/dialogue_understanding/generator/flow_document_template.jinja2 +4 -0
  342. rasa/dialogue_understanding/generator/flow_retrieval.py +410 -0
  343. rasa/dialogue_understanding/generator/llm_command_generator.py +637 -0
  344. rasa/dialogue_understanding/generator/nlu_command_adapter.py +157 -0
  345. rasa/dialogue_understanding/patterns/__init__.py +0 -0
  346. rasa/dialogue_understanding/patterns/cancel.py +111 -0
  347. rasa/dialogue_understanding/patterns/cannot_handle.py +43 -0
  348. rasa/dialogue_understanding/patterns/chitchat.py +37 -0
  349. rasa/dialogue_understanding/patterns/clarify.py +97 -0
  350. rasa/dialogue_understanding/patterns/code_change.py +41 -0
  351. rasa/dialogue_understanding/patterns/collect_information.py +90 -0
  352. rasa/dialogue_understanding/patterns/completed.py +40 -0
  353. rasa/dialogue_understanding/patterns/continue_interrupted.py +42 -0
  354. rasa/dialogue_understanding/patterns/correction.py +278 -0
  355. rasa/dialogue_understanding/patterns/default_flows_for_patterns.yml +243 -0
  356. rasa/dialogue_understanding/patterns/human_handoff.py +37 -0
  357. rasa/dialogue_understanding/patterns/internal_error.py +47 -0
  358. rasa/dialogue_understanding/patterns/search.py +37 -0
  359. rasa/dialogue_understanding/patterns/skip_question.py +38 -0
  360. rasa/dialogue_understanding/processor/__init__.py +0 -0
  361. rasa/dialogue_understanding/processor/command_processor.py +578 -0
  362. rasa/dialogue_understanding/processor/command_processor_component.py +39 -0
  363. rasa/dialogue_understanding/stack/__init__.py +0 -0
  364. rasa/dialogue_understanding/stack/dialogue_stack.py +178 -0
  365. rasa/dialogue_understanding/stack/frames/__init__.py +19 -0
  366. rasa/dialogue_understanding/stack/frames/chit_chat_frame.py +27 -0
  367. rasa/dialogue_understanding/stack/frames/dialogue_stack_frame.py +137 -0
  368. rasa/dialogue_understanding/stack/frames/flow_stack_frame.py +157 -0
  369. rasa/dialogue_understanding/stack/frames/pattern_frame.py +10 -0
  370. rasa/dialogue_understanding/stack/frames/search_frame.py +27 -0
  371. rasa/dialogue_understanding/stack/utils.py +211 -0
  372. rasa/e2e_test/__init__.py +0 -0
  373. rasa/e2e_test/constants.py +10 -0
  374. rasa/e2e_test/e2e_test_case.py +322 -0
  375. rasa/e2e_test/e2e_test_result.py +34 -0
  376. rasa/e2e_test/e2e_test_runner.py +659 -0
  377. rasa/e2e_test/e2e_test_schema.yml +67 -0
  378. rasa/engine/__init__.py +0 -0
  379. rasa/engine/caching.py +464 -0
  380. rasa/engine/constants.py +17 -0
  381. rasa/engine/exceptions.py +14 -0
  382. rasa/engine/graph.py +625 -0
  383. rasa/engine/loader.py +36 -0
  384. rasa/engine/recipes/__init__.py +0 -0
  385. rasa/engine/recipes/config_files/default_config.yml +44 -0
  386. rasa/engine/recipes/default_components.py +99 -0
  387. rasa/engine/recipes/default_recipe.py +1252 -0
  388. rasa/engine/recipes/graph_recipe.py +79 -0
  389. rasa/engine/recipes/recipe.py +93 -0
  390. rasa/engine/runner/__init__.py +0 -0
  391. rasa/engine/runner/dask.py +256 -0
  392. rasa/engine/runner/interface.py +49 -0
  393. rasa/engine/storage/__init__.py +0 -0
  394. rasa/engine/storage/local_model_storage.py +248 -0
  395. rasa/engine/storage/resource.py +110 -0
  396. rasa/engine/storage/storage.py +203 -0
  397. rasa/engine/training/__init__.py +0 -0
  398. rasa/engine/training/components.py +176 -0
  399. rasa/engine/training/fingerprinting.py +64 -0
  400. rasa/engine/training/graph_trainer.py +256 -0
  401. rasa/engine/training/hooks.py +164 -0
  402. rasa/engine/validation.py +839 -0
  403. rasa/env.py +5 -0
  404. rasa/exceptions.py +69 -0
  405. rasa/graph_components/__init__.py +0 -0
  406. rasa/graph_components/converters/__init__.py +0 -0
  407. rasa/graph_components/converters/nlu_message_converter.py +48 -0
  408. rasa/graph_components/providers/__init__.py +0 -0
  409. rasa/graph_components/providers/domain_for_core_training_provider.py +87 -0
  410. rasa/graph_components/providers/domain_provider.py +71 -0
  411. rasa/graph_components/providers/flows_provider.py +74 -0
  412. rasa/graph_components/providers/forms_provider.py +44 -0
  413. rasa/graph_components/providers/nlu_training_data_provider.py +56 -0
  414. rasa/graph_components/providers/responses_provider.py +44 -0
  415. rasa/graph_components/providers/rule_only_provider.py +49 -0
  416. rasa/graph_components/providers/story_graph_provider.py +43 -0
  417. rasa/graph_components/providers/training_tracker_provider.py +55 -0
  418. rasa/graph_components/validators/__init__.py +0 -0
  419. rasa/graph_components/validators/default_recipe_validator.py +552 -0
  420. rasa/graph_components/validators/finetuning_validator.py +302 -0
  421. rasa/hooks.py +113 -0
  422. rasa/jupyter.py +63 -0
  423. rasa/keys +1 -0
  424. rasa/markers/__init__.py +0 -0
  425. rasa/markers/marker.py +269 -0
  426. rasa/markers/marker_base.py +828 -0
  427. rasa/markers/upload.py +74 -0
  428. rasa/markers/validate.py +21 -0
  429. rasa/model.py +118 -0
  430. rasa/model_testing.py +457 -0
  431. rasa/model_training.py +535 -0
  432. rasa/nlu/__init__.py +7 -0
  433. rasa/nlu/classifiers/__init__.py +3 -0
  434. rasa/nlu/classifiers/classifier.py +5 -0
  435. rasa/nlu/classifiers/diet_classifier.py +1874 -0
  436. rasa/nlu/classifiers/fallback_classifier.py +192 -0
  437. rasa/nlu/classifiers/keyword_intent_classifier.py +188 -0
  438. rasa/nlu/classifiers/llm_intent_classifier.py +519 -0
  439. rasa/nlu/classifiers/logistic_regression_classifier.py +240 -0
  440. rasa/nlu/classifiers/mitie_intent_classifier.py +156 -0
  441. rasa/nlu/classifiers/regex_message_handler.py +56 -0
  442. rasa/nlu/classifiers/sklearn_intent_classifier.py +309 -0
  443. rasa/nlu/constants.py +77 -0
  444. rasa/nlu/convert.py +40 -0
  445. rasa/nlu/emulators/__init__.py +0 -0
  446. rasa/nlu/emulators/dialogflow.py +55 -0
  447. rasa/nlu/emulators/emulator.py +49 -0
  448. rasa/nlu/emulators/luis.py +86 -0
  449. rasa/nlu/emulators/no_emulator.py +10 -0
  450. rasa/nlu/emulators/wit.py +56 -0
  451. rasa/nlu/extractors/__init__.py +0 -0
  452. rasa/nlu/extractors/crf_entity_extractor.py +672 -0
  453. rasa/nlu/extractors/duckling_entity_extractor.py +206 -0
  454. rasa/nlu/extractors/entity_synonyms.py +178 -0
  455. rasa/nlu/extractors/extractor.py +470 -0
  456. rasa/nlu/extractors/mitie_entity_extractor.py +293 -0
  457. rasa/nlu/extractors/regex_entity_extractor.py +220 -0
  458. rasa/nlu/extractors/spacy_entity_extractor.py +95 -0
  459. rasa/nlu/featurizers/__init__.py +0 -0
  460. rasa/nlu/featurizers/dense_featurizer/__init__.py +0 -0
  461. rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +449 -0
  462. rasa/nlu/featurizers/dense_featurizer/dense_featurizer.py +57 -0
  463. rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +772 -0
  464. rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +170 -0
  465. rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +132 -0
  466. rasa/nlu/featurizers/featurizer.py +89 -0
  467. rasa/nlu/featurizers/sparse_featurizer/__init__.py +0 -0
  468. rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +840 -0
  469. rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +539 -0
  470. rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +269 -0
  471. rasa/nlu/featurizers/sparse_featurizer/sparse_featurizer.py +9 -0
  472. rasa/nlu/model.py +24 -0
  473. rasa/nlu/persistor.py +240 -0
  474. rasa/nlu/run.py +27 -0
  475. rasa/nlu/selectors/__init__.py +0 -0
  476. rasa/nlu/selectors/response_selector.py +990 -0
  477. rasa/nlu/test.py +1943 -0
  478. rasa/nlu/tokenizers/__init__.py +0 -0
  479. rasa/nlu/tokenizers/jieba_tokenizer.py +148 -0
  480. rasa/nlu/tokenizers/mitie_tokenizer.py +75 -0
  481. rasa/nlu/tokenizers/spacy_tokenizer.py +72 -0
  482. rasa/nlu/tokenizers/tokenizer.py +239 -0
  483. rasa/nlu/tokenizers/whitespace_tokenizer.py +106 -0
  484. rasa/nlu/utils/__init__.py +35 -0
  485. rasa/nlu/utils/bilou_utils.py +462 -0
  486. rasa/nlu/utils/hugging_face/__init__.py +0 -0
  487. rasa/nlu/utils/hugging_face/registry.py +108 -0
  488. rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py +311 -0
  489. rasa/nlu/utils/mitie_utils.py +113 -0
  490. rasa/nlu/utils/pattern_utils.py +168 -0
  491. rasa/nlu/utils/spacy_utils.py +312 -0
  492. rasa/plugin.py +90 -0
  493. rasa/server.py +1536 -0
  494. rasa/shared/__init__.py +0 -0
  495. rasa/shared/constants.py +181 -0
  496. rasa/shared/core/__init__.py +0 -0
  497. rasa/shared/core/constants.py +168 -0
  498. rasa/shared/core/conversation.py +46 -0
  499. rasa/shared/core/domain.py +2106 -0
  500. rasa/shared/core/events.py +2507 -0
  501. rasa/shared/core/flows/__init__.py +7 -0
  502. rasa/shared/core/flows/flow.py +353 -0
  503. rasa/shared/core/flows/flow_step.py +146 -0
  504. rasa/shared/core/flows/flow_step_links.py +319 -0
  505. rasa/shared/core/flows/flow_step_sequence.py +70 -0
  506. rasa/shared/core/flows/flows_list.py +211 -0
  507. rasa/shared/core/flows/flows_yaml_schema.json +217 -0
  508. rasa/shared/core/flows/nlu_trigger.py +117 -0
  509. rasa/shared/core/flows/steps/__init__.py +24 -0
  510. rasa/shared/core/flows/steps/action.py +51 -0
  511. rasa/shared/core/flows/steps/call.py +64 -0
  512. rasa/shared/core/flows/steps/collect.py +112 -0
  513. rasa/shared/core/flows/steps/constants.py +5 -0
  514. rasa/shared/core/flows/steps/continuation.py +36 -0
  515. rasa/shared/core/flows/steps/end.py +22 -0
  516. rasa/shared/core/flows/steps/internal.py +44 -0
  517. rasa/shared/core/flows/steps/link.py +51 -0
  518. rasa/shared/core/flows/steps/no_operation.py +48 -0
  519. rasa/shared/core/flows/steps/set_slots.py +50 -0
  520. rasa/shared/core/flows/steps/start.py +30 -0
  521. rasa/shared/core/flows/validation.py +527 -0
  522. rasa/shared/core/flows/yaml_flows_io.py +278 -0
  523. rasa/shared/core/generator.py +907 -0
  524. rasa/shared/core/slot_mappings.py +235 -0
  525. rasa/shared/core/slots.py +647 -0
  526. rasa/shared/core/trackers.py +1159 -0
  527. rasa/shared/core/training_data/__init__.py +0 -0
  528. rasa/shared/core/training_data/loading.py +90 -0
  529. rasa/shared/core/training_data/story_reader/__init__.py +0 -0
  530. rasa/shared/core/training_data/story_reader/story_reader.py +129 -0
  531. rasa/shared/core/training_data/story_reader/story_step_builder.py +168 -0
  532. rasa/shared/core/training_data/story_reader/yaml_story_reader.py +888 -0
  533. rasa/shared/core/training_data/story_writer/__init__.py +0 -0
  534. rasa/shared/core/training_data/story_writer/story_writer.py +76 -0
  535. rasa/shared/core/training_data/story_writer/yaml_story_writer.py +442 -0
  536. rasa/shared/core/training_data/structures.py +838 -0
  537. rasa/shared/core/training_data/visualization.html +146 -0
  538. rasa/shared/core/training_data/visualization.py +603 -0
  539. rasa/shared/data.py +192 -0
  540. rasa/shared/engine/__init__.py +0 -0
  541. rasa/shared/engine/caching.py +26 -0
  542. rasa/shared/exceptions.py +129 -0
  543. rasa/shared/importers/__init__.py +0 -0
  544. rasa/shared/importers/importer.py +705 -0
  545. rasa/shared/importers/multi_project.py +203 -0
  546. rasa/shared/importers/rasa.py +100 -0
  547. rasa/shared/importers/utils.py +34 -0
  548. rasa/shared/nlu/__init__.py +0 -0
  549. rasa/shared/nlu/constants.py +45 -0
  550. rasa/shared/nlu/interpreter.py +10 -0
  551. rasa/shared/nlu/training_data/__init__.py +0 -0
  552. rasa/shared/nlu/training_data/entities_parser.py +209 -0
  553. rasa/shared/nlu/training_data/features.py +374 -0
  554. rasa/shared/nlu/training_data/formats/__init__.py +10 -0
  555. rasa/shared/nlu/training_data/formats/dialogflow.py +162 -0
  556. rasa/shared/nlu/training_data/formats/luis.py +87 -0
  557. rasa/shared/nlu/training_data/formats/rasa.py +135 -0
  558. rasa/shared/nlu/training_data/formats/rasa_yaml.py +605 -0
  559. rasa/shared/nlu/training_data/formats/readerwriter.py +245 -0
  560. rasa/shared/nlu/training_data/formats/wit.py +52 -0
  561. rasa/shared/nlu/training_data/loading.py +137 -0
  562. rasa/shared/nlu/training_data/lookup_tables_parser.py +30 -0
  563. rasa/shared/nlu/training_data/message.py +477 -0
  564. rasa/shared/nlu/training_data/schemas/__init__.py +0 -0
  565. rasa/shared/nlu/training_data/schemas/data_schema.py +85 -0
  566. rasa/shared/nlu/training_data/schemas/nlu.yml +53 -0
  567. rasa/shared/nlu/training_data/schemas/responses.yml +70 -0
  568. rasa/shared/nlu/training_data/synonyms_parser.py +42 -0
  569. rasa/shared/nlu/training_data/training_data.py +732 -0
  570. rasa/shared/nlu/training_data/util.py +223 -0
  571. rasa/shared/providers/__init__.py +0 -0
  572. rasa/shared/providers/openai/__init__.py +0 -0
  573. rasa/shared/providers/openai/clients.py +43 -0
  574. rasa/shared/providers/openai/session_handler.py +110 -0
  575. rasa/shared/utils/__init__.py +0 -0
  576. rasa/shared/utils/cli.py +72 -0
  577. rasa/shared/utils/common.py +308 -0
  578. rasa/shared/utils/constants.py +1 -0
  579. rasa/shared/utils/io.py +403 -0
  580. rasa/shared/utils/llm.py +405 -0
  581. rasa/shared/utils/pykwalify_extensions.py +26 -0
  582. rasa/shared/utils/schemas/__init__.py +0 -0
  583. rasa/shared/utils/schemas/config.yml +2 -0
  584. rasa/shared/utils/schemas/domain.yml +142 -0
  585. rasa/shared/utils/schemas/events.py +212 -0
  586. rasa/shared/utils/schemas/model_config.yml +46 -0
  587. rasa/shared/utils/schemas/stories.yml +173 -0
  588. rasa/shared/utils/yaml.py +777 -0
  589. rasa/studio/__init__.py +0 -0
  590. rasa/studio/auth.py +252 -0
  591. rasa/studio/config.py +127 -0
  592. rasa/studio/constants.py +16 -0
  593. rasa/studio/data_handler.py +352 -0
  594. rasa/studio/download.py +350 -0
  595. rasa/studio/train.py +136 -0
  596. rasa/studio/upload.py +408 -0
  597. rasa/telemetry.py +1583 -0
  598. rasa/tracing/__init__.py +0 -0
  599. rasa/tracing/config.py +338 -0
  600. rasa/tracing/constants.py +38 -0
  601. rasa/tracing/instrumentation/__init__.py +0 -0
  602. rasa/tracing/instrumentation/attribute_extractors.py +663 -0
  603. rasa/tracing/instrumentation/instrumentation.py +939 -0
  604. rasa/tracing/instrumentation/intentless_policy_instrumentation.py +142 -0
  605. rasa/tracing/instrumentation/metrics.py +206 -0
  606. rasa/tracing/metric_instrument_provider.py +125 -0
  607. rasa/utils/__init__.py +0 -0
  608. rasa/utils/beta.py +83 -0
  609. rasa/utils/cli.py +27 -0
  610. rasa/utils/common.py +635 -0
  611. rasa/utils/converter.py +53 -0
  612. rasa/utils/endpoints.py +303 -0
  613. rasa/utils/io.py +326 -0
  614. rasa/utils/licensing.py +319 -0
  615. rasa/utils/log_utils.py +174 -0
  616. rasa/utils/mapper.py +210 -0
  617. rasa/utils/ml_utils.py +145 -0
  618. rasa/utils/plotting.py +362 -0
  619. rasa/utils/singleton.py +23 -0
  620. rasa/utils/tensorflow/__init__.py +0 -0
  621. rasa/utils/tensorflow/callback.py +112 -0
  622. rasa/utils/tensorflow/constants.py +116 -0
  623. rasa/utils/tensorflow/crf.py +492 -0
  624. rasa/utils/tensorflow/data_generator.py +440 -0
  625. rasa/utils/tensorflow/environment.py +161 -0
  626. rasa/utils/tensorflow/exceptions.py +5 -0
  627. rasa/utils/tensorflow/layers.py +1565 -0
  628. rasa/utils/tensorflow/layers_utils.py +113 -0
  629. rasa/utils/tensorflow/metrics.py +281 -0
  630. rasa/utils/tensorflow/model_data.py +991 -0
  631. rasa/utils/tensorflow/model_data_utils.py +500 -0
  632. rasa/utils/tensorflow/models.py +936 -0
  633. rasa/utils/tensorflow/rasa_layers.py +1094 -0
  634. rasa/utils/tensorflow/transformer.py +640 -0
  635. rasa/utils/tensorflow/types.py +6 -0
  636. rasa/utils/train_utils.py +572 -0
  637. rasa/utils/yaml.py +54 -0
  638. rasa/validator.py +1035 -0
  639. rasa/version.py +3 -0
  640. rasa_pro-3.8.16.dist-info/METADATA +528 -0
  641. rasa_pro-3.8.16.dist-info/NOTICE +5 -0
  642. rasa_pro-3.8.16.dist-info/RECORD +644 -0
  643. rasa_pro-3.8.16.dist-info/WHEEL +4 -0
  644. rasa_pro-3.8.16.dist-info/entry_points.txt +3 -0
rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -0,0 +1,840 @@
+ from __future__ import annotations
+ import logging
+ import re
+ import scipy.sparse
+ from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type
+ from rasa.nlu.tokenizers.tokenizer import Tokenizer
+
+ import rasa.shared.utils.io
+ from rasa.engine.graph import GraphComponent, ExecutionContext
+ from rasa.engine.recipes.default_recipe import DefaultV1Recipe
+ from rasa.engine.storage.resource import Resource
+ from rasa.engine.storage.storage import ModelStorage
+ from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
+ from rasa.nlu.utils.spacy_utils import SpacyModel
+ from rasa.shared.constants import DOCS_URL_COMPONENTS
+ import rasa.utils.io as io_utils
+ from sklearn.exceptions import NotFittedError
+ from sklearn.feature_extraction.text import CountVectorizer
+ from rasa.shared.nlu.training_data.training_data import TrainingData
+ from rasa.shared.nlu.training_data.message import Message
+ from rasa.shared.exceptions import RasaException, FileIOException
+ from rasa.nlu.constants import (
+     TOKENS_NAMES,
+     MESSAGE_ATTRIBUTES,
+     DENSE_FEATURIZABLE_ATTRIBUTES,
+ )
+ from rasa.shared.nlu.constants import TEXT, INTENT, INTENT_RESPONSE_KEY, ACTION_NAME
+
+ BUFFER_SLOTS_PREFIX = "buf_"
+
+ logger = logging.getLogger(__name__)
+
+
+ @DefaultV1Recipe.register(
+     DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER, is_trainable=True
+ )
+ class CountVectorsFeaturizer(SparseFeaturizer, GraphComponent):
+     """Creates a sequence of token counts features based on sklearn's `CountVectorizer`.
+
+     All tokens which consist only of digits (e.g. 123 and 99
+     but not ab12d) will be represented by a single feature.
+
+     Set `analyzer` to 'char_wb'
+     to use the idea of Subword Semantic Hashing
+     from https://arxiv.org/abs/1810.07150.
+     """
+
+     OOV_words: List[Text]
+
+     @classmethod
+     def required_components(cls) -> List[Type]:
+         """Components that should be included in the pipeline before this component."""
+         return [Tokenizer]
+
+     @staticmethod
+     def get_default_config() -> Dict[Text, Any]:
+         """Returns the component's default config."""
+         return {
+             **SparseFeaturizer.get_default_config(),
+             # whether to use a shared vocab
+             "use_shared_vocab": False,
+             # the parameters are taken from
+             # sklearn's CountVectorizer
+             # whether to use word or character n-grams
+             # 'char_wb' creates character n-grams inside word boundaries
+             # n-grams at the edges of words are padded with space.
+             "analyzer": "word",  # use 'char' or 'char_wb' for character
+             # remove accents during the preprocessing step
+             "strip_accents": None,  # {'ascii', 'unicode', None}
+             # list of stop words
+             "stop_words": None,  # string {'english'}, list, or None (default)
+             # min document frequency of a word to add to vocabulary
+             # float - the parameter represents a proportion of documents
+             # integer - absolute counts
+             "min_df": 1,  # float in range [0.0, 1.0] or int
+             # max document frequency of a word to add to vocabulary
+             # float - the parameter represents a proportion of documents
+             # integer - absolute counts
+             "max_df": 1.0,  # float in range [0.0, 1.0] or int
+             # set range of ngrams to be extracted
+             "min_ngram": 1,  # int
+             "max_ngram": 1,  # int
+             # limit vocabulary size
+             "max_features": None,  # int or None
+             # if convert all characters to lowercase
+             "lowercase": True,  # bool
+             # handling Out-Of-Vocabulary (OOV) words
+             # will be converted to lowercase if lowercase is True
+             "OOV_token": None,  # string or None
+             "OOV_words": [],  # string or list of strings
+             # indicates whether the featurizer should use the lemma of a word for
+             # counting (if available) or not
+             "use_lemma": True,
+         }
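
For orientation, a minimal sketch (plain scikit-learn, not part of this diff) of how these defaults roughly map onto `CountVectorizer` arguments; the `min_ngram`/`max_ngram` pair becomes sklearn's `ngram_range`:

from sklearn.feature_extraction.text import CountVectorizer

config = {
    "analyzer": "word", "strip_accents": None, "stop_words": None,
    "min_df": 1, "max_df": 1.0, "min_ngram": 1, "max_ngram": 1,
    "max_features": None, "lowercase": True,
}
vectorizer = CountVectorizer(
    analyzer=config["analyzer"],
    strip_accents=config["strip_accents"],
    stop_words=config["stop_words"],
    min_df=config["min_df"],
    max_df=config["max_df"],
    ngram_range=(config["min_ngram"], config["max_ngram"]),
    max_features=config["max_features"],
    lowercase=config["lowercase"],
)
vectorizer.fit(["book a flight", "play a song"])
print(sorted(vectorizer.vocabulary_))  # ['book', 'flight', 'play', 'song']
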
+
+     @staticmethod
+     def required_packages() -> List[Text]:
+         """Any extra python dependencies required for this component to run."""
+         return ["sklearn"]
+
+     def _load_count_vect_params(self) -> None:
+
+         # Use shared vocabulary between text and all other attributes of Message
+         self.use_shared_vocab = self._config["use_shared_vocab"]
+
+         # set analyzer
+         self.analyzer = self._config["analyzer"]
+
+         # remove accents during the preprocessing step
+         self.strip_accents = self._config["strip_accents"]
+
+         # list of stop words
+         self.stop_words = self._config["stop_words"]
+
+         # min number of word occurancies in the document to add to vocabulary
+         self.min_df = self._config["min_df"]
+
+         # max number (fraction if float) of word occurancies
+         # in the document to add to vocabulary
+         self.max_df = self._config["max_df"]
+
+         # set ngram range
+         self.min_ngram = self._config["min_ngram"]
+         self.max_ngram = self._config["max_ngram"]
+
+         # limit vocabulary size
+         self.max_features = self._config["max_features"]
+
+         # if convert all characters to lowercase
+         self.lowercase = self._config["lowercase"]
+
+         # use the lemma of the words or not
+         self.use_lemma = self._config["use_lemma"]
+
+     def _load_vocabulary_params(self) -> Tuple[Text, List[Text]]:
+         OOV_token = self._config["OOV_token"]
+
+         OOV_words = self._config["OOV_words"]
+         if OOV_words and not OOV_token:
+             logger.error(
+                 "The list OOV_words={} was given, but "
+                 "OOV_token was not. OOV words are ignored."
+                 "".format(OOV_words)
+             )
+             self.OOV_words = []
+
+         if self.lowercase and OOV_token:
+             # convert to lowercase
+             OOV_token = OOV_token.lower()
+             if OOV_words:
+                 OOV_words = [w.lower() for w in OOV_words]
+
+         return OOV_token, OOV_words
+
+     def _get_attribute_vocabulary(self, attribute: Text) -> Optional[Dict[Text, int]]:
+         """Gets trained vocabulary from attribute's count vectorizer."""
+         try:
+             return self.vectorizers[attribute].vocabulary_
+         except (AttributeError, TypeError, KeyError):
+             return None
+
+     def _check_analyzer(self) -> None:
+         if self.analyzer != "word":
+             if self.OOV_token is not None:
+                 logger.warning(
+                     "Analyzer is set to character, "
+                     "provided OOV word token will be ignored."
+                 )
+             if self.stop_words is not None:
+                 logger.warning(
+                     "Analyzer is set to character, "
+                     "provided stop words will be ignored."
+                 )
+             if self.max_ngram == 1:
+                 logger.warning(
+                     "Analyzer is set to character, "
+                     "but max n-gram is set to 1. "
+                     "It means that the vocabulary will "
+                     "contain single letters only."
+                 )
+
+     @staticmethod
+     def _attributes_for(analyzer: Text) -> List[Text]:
+         """Create a list of attributes that should be featurized."""
+         # intents should be featurized only by word level count vectorizer
+         return (
+             MESSAGE_ATTRIBUTES if analyzer == "word" else DENSE_FEATURIZABLE_ATTRIBUTES
+         )
+
+     def __init__(
+         self,
+         config: Dict[Text, Any],
+         model_storage: ModelStorage,
+         resource: Resource,
+         execution_context: ExecutionContext,
+         vectorizers: Optional[Dict[Text, "CountVectorizer"]] = None,
+         oov_token: Optional[Text] = None,
+         oov_words: Optional[List[Text]] = None,
+     ) -> None:
+         """Constructs a new count vectorizer using the sklearn framework."""
+         super().__init__(execution_context.node_name, config)
+
+         self._model_storage = model_storage
+         self._resource = resource
+
+         # parameters for sklearn's CountVectorizer
+         self._load_count_vect_params()
+
+         # handling Out-Of-Vocabulary (OOV) words
+         if oov_token and oov_words:
+             self.OOV_token = oov_token
+             self.OOV_words = oov_words
+         else:
+             self.OOV_token, self.OOV_words = self._load_vocabulary_params()
+
+         # warn that some of config parameters might be ignored
+         self._check_analyzer()
+
+         # set which attributes to featurize
+         self._attributes = self._attributes_for(self.analyzer)
+
+         # declare class instance for CountVectorizer
+         self.vectorizers = vectorizers or {}
+
+         self.finetune_mode = execution_context.is_finetuning
+
+     @classmethod
+     def create(
+         cls,
+         config: Dict[Text, Any],
+         model_storage: ModelStorage,
+         resource: Resource,
+         execution_context: ExecutionContext,
+     ) -> CountVectorsFeaturizer:
+         """Creates a new untrained component (see parent class for full docstring)."""
+         return cls(config, model_storage, resource, execution_context)
+
+     def _get_message_tokens_by_attribute(
+         self, message: "Message", attribute: Text
+     ) -> List[Text]:
+         """Get text tokens of an attribute of a message."""
+         if message.get(TOKENS_NAMES[attribute]):
+             return [
+                 t.lemma if self.use_lemma else t.text
+                 for t in message.get(TOKENS_NAMES[attribute])
+             ]
+         else:
+             return []
+
+     def _process_tokens(self, tokens: List[Text], attribute: Text = TEXT) -> List[Text]:
+         """Apply processing and cleaning steps to text."""
+         if attribute in [INTENT, ACTION_NAME, INTENT_RESPONSE_KEY]:
+             # Don't do any processing for intent attribute. Treat them as whole labels
+             return tokens
+
+         # replace all digits with NUMBER token
+         tokens = [re.sub(r"\b[0-9]+\b", "__NUMBER__", text) for text in tokens]
+
+         # convert to lowercase if necessary
+         if self.lowercase:
+             tokens = [text.lower() for text in tokens]
+
+         return tokens
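
A small standard-library-only sketch (illustrative, not from the package) of the cleaning applied above: digit-only tokens collapse to one shared placeholder, which is then lowercased along with everything else when `lowercase` is enabled:

import re

tokens = ["Book", "2", "tickets", "for", "2024"]
# digit-only tokens collapse to one shared placeholder ...
tokens = [re.sub(r"\b[0-9]+\b", "__NUMBER__", t) for t in tokens]
# ... which is lowercased together with the rest
tokens = [t.lower() for t in tokens]
print(tokens)  # ['book', '__number__', 'tickets', 'for', '__number__']
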
+
+     def _replace_with_oov_token(
+         self, tokens: List[Text], attribute: Text
+     ) -> List[Text]:
+         """Replace OOV words with OOV token."""
+         if self.OOV_token and self.analyzer == "word":
+             attribute_vocab = self._get_attribute_vocabulary(attribute)
+             if attribute_vocab is not None and self.OOV_token in attribute_vocab:
+                 # CountVectorizer is trained, process for prediction
+                 attribute_vocabulary_tokens = set(attribute_vocab.keys())
+                 tokens = [
+                     t if t in attribute_vocabulary_tokens else self.OOV_token
+                     for t in tokens
+                 ]
+             elif self.OOV_words:
+                 # CountVectorizer is not trained, process for train
+                 tokens = [self.OOV_token if t in self.OOV_words else t for t in tokens]
+
+         return tokens
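
An illustrative plain-Python sketch (not from the package) of the two OOV regimes handled above: before a vocabulary exists only the configured `OOV_words` are replaced, and once a vocabulary has been learned any token outside it is replaced:

OOV_token, OOV_words = "oov", ["outofvocab", "unknownish"]

# before a vocabulary exists (training): only the listed OOV_words are replaced
train_tokens = ["book", "outofvocab", "flight"]
train_tokens = [OOV_token if t in OOV_words else t for t in train_tokens]
print(train_tokens)  # ['book', 'oov', 'flight']

# once a vocabulary exists (prediction): anything outside it is replaced
vocabulary = {"book": 0, "flight": 1, "oov": 2}
pred_tokens = ["book", "a", "rocket"]
pred_tokens = [t if t in vocabulary else OOV_token for t in pred_tokens]
print(pred_tokens)  # ['book', 'oov', 'oov']
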
+
+     def _get_processed_message_tokens_by_attribute(
+         self, message: Message, attribute: Text = TEXT
+     ) -> List[Text]:
+         """Get processed text of attribute of a message."""
+         if message.get(attribute) is None:
+             # return empty list since sklearn countvectorizer does not like None
+             # object while training and predicting
+             return []
+
+         tokens = self._get_message_tokens_by_attribute(message, attribute)
+         tokens = self._process_tokens(tokens, attribute)
+         tokens = self._replace_with_oov_token(tokens, attribute)
+
+         return tokens
+
+     # noinspection PyPep8Naming
+     def _check_OOV_present(self, all_tokens: List[List[Text]], attribute: Text) -> None:
+         """Check if an OOV word is present."""
+         if not self.OOV_token or self.OOV_words or not all_tokens:
+             return
+
+         for tokens in all_tokens:
+             for text in tokens:
+                 if self.OOV_token in text or (
+                     self.lowercase and self.OOV_token in text.lower()
+                 ):
+                     return
+
+         if any(text for tokens in all_tokens for text in tokens):
+             training_data_type = "NLU" if attribute == TEXT else "ResponseSelector"
+
+             # if there is some text in tokens, warn if there is no oov token
+             rasa.shared.utils.io.raise_warning(
+                 f"The out of vocabulary token '{self.OOV_token}' was configured, but "
+                 f"could not be found in any one of the {training_data_type} "
+                 f"training examples. All unseen words will be "
+                 f"ignored during prediction.",
+                 docs=DOCS_URL_COMPONENTS + "#countvectorsfeaturizer",
+             )
+
+     def _get_all_attributes_processed_tokens(
+         self, training_data: TrainingData
+     ) -> Dict[Text, List[List[Text]]]:
+         """Get processed text for all attributes of examples in training data."""
+         processed_attribute_tokens = {}
+         for attribute in self._attributes:
+             all_tokens = [
+                 self._get_processed_message_tokens_by_attribute(example, attribute)
+                 for example in training_data.training_examples
+             ]
+             if attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
+                 # check for oov tokens only in text based attributes
+                 self._check_OOV_present(all_tokens, attribute)
+             processed_attribute_tokens[attribute] = all_tokens
+
+         return processed_attribute_tokens
+
+     @staticmethod
+     def _convert_attribute_tokens_to_texts(
+         attribute_tokens: Dict[Text, List[List[Text]]]
+     ) -> Dict[Text, List[Text]]:
+         attribute_texts = {}
+
+         for attribute in attribute_tokens.keys():
+             list_of_tokens = attribute_tokens[attribute]
+             attribute_texts[attribute] = [" ".join(tokens) for tokens in list_of_tokens]
+
+         return attribute_texts
+
+     def _update_vectorizer_vocabulary(
+         self, attribute: Text, new_vocabulary: Set[Text]
+     ) -> None:
+         """Updates the existing vocabulary of the vectorizer with new unseen words.
+
+         Args:
+             attribute: Message attribute for which vocabulary should be updated.
+             new_vocabulary: Set of words to expand the vocabulary with if they are
+                 unseen.
+         """
+         existing_vocabulary: Dict[Text, int] = self.vectorizers[attribute].vocabulary
+         self._merge_new_vocabulary_tokens(existing_vocabulary, new_vocabulary)
+         self._set_vocabulary(attribute, existing_vocabulary)
+
+     def _merge_new_vocabulary_tokens(
+         self, existing_vocabulary: Dict[Text, int], vocabulary: Set[Text]
+     ) -> None:
+         """Merges new vocabulary tokens with the existing vocabulary.
+
+         New vocabulary items should always be added to the end of the existing
+         vocabulary and the order of the existing vocabulary should not be disturbed.
+
+         Args:
+             existing_vocabulary: existing vocabulary
+             vocabulary: set of new tokens
+
+         Raises:
+             RasaException: if `use_shared_vocab` is set to True and there are new
+                 vocabulary items added during incremental training.
+         """
+         for token in vocabulary:
+             if token not in existing_vocabulary:
+                 if self.use_shared_vocab:
+                     raise RasaException(
+                         "Using a shared vocabulary in `CountVectorsFeaturizer` is not "
+                         "supported during incremental training since it requires "
+                         "dynamically adjusting layers that correspond to label "
+                         f"attributes such as {INTENT_RESPONSE_KEY}, {INTENT}, etc. "
+                         "This is currently not possible. In order to avoid this "
+                         "exception we suggest to set `use_shared_vocab=False` or train"
+                         " from scratch."
+                     )
+                 existing_vocabulary[token] = len(existing_vocabulary)
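
A tiny editorial sketch (not from the package) of the merge rule the docstring describes: new tokens are appended with fresh indices while existing indices are left untouched:

existing_vocabulary = {"book": 0, "flight": 1}
new_tokens = {"hotel", "flight", "car"}

for token in sorted(new_tokens):  # sorted only to keep the example deterministic
    if token not in existing_vocabulary:
        existing_vocabulary[token] = len(existing_vocabulary)

print(existing_vocabulary)  # {'book': 0, 'flight': 1, 'car': 2, 'hotel': 3}
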
+
+     def _set_vocabulary(
+         self, attribute: Text, original_vocabulary: Dict[Text, int]
+     ) -> None:
+         """Sets the vocabulary of the vectorizer of attribute.
+
+         Args:
+             attribute: Message attribute for which vocabulary should be set
+             original_vocabulary: Vocabulary for the attribute to be set.
+         """
+         self.vectorizers[attribute].vocabulary_ = original_vocabulary
+         self.vectorizers[attribute]._validate_vocabulary()
+
+     @staticmethod
+     def _construct_vocabulary_from_texts(
+         vectorizer: CountVectorizer, texts: List[Text]
+     ) -> Set:
+         """Applies vectorizer's preprocessor on texts to get the vocabulary from texts.
+
+         Args:
+             vectorizer: Sklearn's count vectorizer which has been pre-configured.
+             texts: Examples from which the vocabulary should be constructed
+
+         Returns:
+             Unique vocabulary words extracted.
+         """
+         analyzer = vectorizer.build_analyzer()
+         vocabulary_words = set()
+         for example in texts:
+             example_vocabulary: List[Text] = analyzer(example)
+             vocabulary_words.update(example_vocabulary)
+         return vocabulary_words
+
+     @staticmethod
+     def _attribute_texts_is_non_empty(attribute_texts: List[Text]) -> bool:
+         return any(attribute_texts)
+
+     def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]) -> None:
+         """Constructs the vectorizers and train them with a shared vocab."""
+         combined_cleaned_texts = []
+         for attribute in self._attributes:
+             combined_cleaned_texts += attribute_texts[attribute]
+
+         # To train a shared vocabulary, we use TEXT as the
+         # attribute for which a combined vocabulary is built.
+         if not self.finetune_mode:
+             self.vectorizers = self._create_shared_vocab_vectorizers(
+                 {
+                     "strip_accents": self.strip_accents,
+                     "lowercase": self.lowercase,
+                     "stop_words": self.stop_words,
+                     "min_ngram": self.min_ngram,
+                     "max_ngram": self.max_ngram,
+                     "max_df": self.max_df,
+                     "min_df": self.min_df,
+                     "max_features": self.max_features,
+                     "analyzer": self.analyzer,
+                 }
+             )
+             self._fit_vectorizer_from_scratch(TEXT, combined_cleaned_texts)
+         else:
+             self._fit_loaded_vectorizer(TEXT, combined_cleaned_texts)
+         self._log_vocabulary_stats(TEXT)
+
+     def _train_with_independent_vocab(
+         self, attribute_texts: Dict[Text, List[Text]]
+     ) -> None:
+         """Constructs the vectorizers and train them with an independent vocab."""
+         if not self.finetune_mode:
+             self.vectorizers = self._create_independent_vocab_vectorizers(
+                 {
+                     "strip_accents": self.strip_accents,
+                     "lowercase": self.lowercase,
+                     "stop_words": self.stop_words,
+                     "min_ngram": self.min_ngram,
+                     "max_ngram": self.max_ngram,
+                     "max_df": self.max_df,
+                     "min_df": self.min_df,
+                     "max_features": self.max_features,
+                     "analyzer": self.analyzer,
+                 }
+             )
+         for attribute in self._attributes:
+             if self._attribute_texts_is_non_empty(attribute_texts[attribute]):
+                 if not self.finetune_mode:
+                     self._fit_vectorizer_from_scratch(
+                         attribute, attribute_texts[attribute]
+                     )
+                 else:
+                     self._fit_loaded_vectorizer(attribute, attribute_texts[attribute])
+
+                 self._log_vocabulary_stats(attribute)
+             else:
+                 logger.debug(
+                     f"No text provided for {attribute} attribute in any messages of "
+                     f"training data. Skipping training a CountVectorizer for it."
+                 )
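
For contrast, a minimal scikit-learn sketch (illustrative only, not part of the diff) of the difference between the two training modes: a single vectorizer fitted on the combined texts of all attributes versus one vectorizer per attribute:

from sklearn.feature_extraction.text import CountVectorizer

attribute_texts = {
    "text": ["book a flight", "play a song"],
    "intent": ["book_flight", "play_music"],
}

# shared vocabulary: one vectorizer sees the combined texts of all attributes
shared = CountVectorizer().fit(attribute_texts["text"] + attribute_texts["intent"])

# independent vocabularies: one vectorizer per attribute
independent = {attr: CountVectorizer().fit(texts) for attr, texts in attribute_texts.items()}

print(len(shared.vocabulary_))
print({attr: len(vec.vocabulary_) for attr, vec in independent.items()})
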
493
+
494
+    def _log_vocabulary_stats(self, attribute: Text) -> None:
+        """Logs number of vocabulary items that were created for a specified attribute.
+
+        Args:
+            attribute: Message attribute for which vocabulary stats are logged.
+        """
+        if attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
+            vocabulary_size = len(self.vectorizers[attribute].vocabulary_)
+            logger.info(
+                f"{vocabulary_size} vocabulary items "
+                f"were created for {attribute} attribute."
+            )
+
+    def _fit_loaded_vectorizer(
+        self, attribute: Text, attribute_texts: List[Text]
+    ) -> None:
+        """Fits training texts to a previously trained count vectorizer.
+
+        We do not use the `.fit()` method because the new unseen
+        words should occupy the buffer slots of the vocabulary.
+
+        Args:
+            attribute: Message attribute for which the vectorizer is to be trained.
+            attribute_texts: Training texts for the attribute.
+        """
+        # Get the vocabulary words produced by the vectorizer's analyzer.
+        new_vocabulary = self._construct_vocabulary_from_texts(
+            self.vectorizers[attribute], attribute_texts
+        )
+        # Update the vectorizer's vocabulary with the newly seen words.
+        self._update_vectorizer_vocabulary(attribute, new_vocabulary)
+
+    def _fit_vectorizer_from_scratch(
+        self, attribute: Text, attribute_texts: List[Text]
+    ) -> None:
+        """Fits training texts to an untrained count vectorizer.
+
+        Args:
+            attribute: Message attribute for which the vectorizer is to be trained.
+            attribute_texts: Training texts for the attribute.
+        """
+        try:
+            self.vectorizers[attribute].fit(attribute_texts)
+        except ValueError:
+            logger.warning(
+                f"Unable to train CountVectorizer for message "
+                f"attribute {attribute} since the call to sklearn's "
+                f"`.fit()` method failed. Leaving an untrained "
+                f"CountVectorizer for it."
+            )
+
+    def _create_features(
+        self, attribute: Text, all_tokens: List[List[Text]]
+    ) -> Tuple[
+        List[Optional[scipy.sparse.spmatrix]], List[Optional[scipy.sparse.spmatrix]]
+    ]:
+        if not self.vectorizers.get(attribute):
+            return [None], [None]
+
+        sequence_features: List[Optional[scipy.sparse.spmatrix]] = []
+        sentence_features: List[Optional[scipy.sparse.spmatrix]] = []
+
+        try:
+            for i, tokens in enumerate(all_tokens):
+                # vectorizer.transform returns a sparse matrix of size
+                # [n_samples, n_features]
+                # set input to list of tokens if sequence should be returned
+                # otherwise join all tokens to a single string and pass that as a list
+                if not tokens:
+                    # attribute is not set (e.g. response not present)
+                    sequence_features.append(None)
+                    sentence_features.append(None)
+                    continue
+
+                seq_vec = self.vectorizers[attribute].transform(tokens)
+                seq_vec.sort_indices()
+
+                sequence_features.append(seq_vec.tocoo())
+
+                if attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
+                    tokens_text = [" ".join(tokens)]
+                    sentence_vec = self.vectorizers[attribute].transform(tokens_text)
+                    sentence_vec.sort_indices()
+
+                    sentence_features.append(sentence_vec.tocoo())
+                else:
+                    sentence_features.append(None)
+        except NotFittedError:
+            logger.warning(
+                f"Unable to featurize message attribute {attribute}: "
+                f"the CountVectorizer for it has not been trained. "
+                f"No features will be created for this attribute."
+            )
+            return [None], [None]
+
+        return sequence_features, sentence_features
+
+    def _get_featurized_attribute(
+        self, attribute: Text, all_tokens: List[List[Text]]
+    ) -> Tuple[
+        List[Optional[scipy.sparse.spmatrix]], List[Optional[scipy.sparse.spmatrix]]
+    ]:
+        """Returns features of a particular attribute for complete data."""
+        if self._get_attribute_vocabulary(attribute) is not None:
+            # count vectorizer was trained
+            return self._create_features(attribute, all_tokens)
+        else:
+            return [], []
+
+    def train(
+        self, training_data: TrainingData, model: Optional[SpacyModel] = None
+    ) -> Resource:
+        """Trains the featurizer.
+
+        Takes parameters from the config and constructs a new count vectorizer
+        using the sklearn framework.
+        """
+        if model is not None:
+            # create spacy lemma_ for OOV_words
+            self.OOV_words = [
+                t.lemma_ if self.use_lemma else t.text
+                for w in self.OOV_words
+                for t in model.model(w)
+            ]
+
+        # process sentences and collect data for all attributes
+        processed_attribute_tokens = self._get_all_attributes_processed_tokens(
+            training_data
+        )
+
+        # train for all attributes
+        attribute_texts = self._convert_attribute_tokens_to_texts(
+            processed_attribute_tokens
+        )
+        if self.use_shared_vocab:
+            self._train_with_shared_vocab(attribute_texts)
+        else:
+            self._train_with_independent_vocab(attribute_texts)
+
+        self.persist()
+
+        return self._resource
+
+    def process_training_data(self, training_data: TrainingData) -> TrainingData:
+        """Processes the training examples in the given training data in-place.
+
+        Args:
+            training_data: the training data
+
+        Returns:
+            same training data after processing
+        """
+        self.process(training_data.training_examples)
+        return training_data
+
+    def process(self, messages: List[Message]) -> List[Message]:
+        """Processes incoming messages and computes and sets their features."""
+        if self.vectorizers is None:
+            logger.error(
+                "There is no trained CountVectorizer: "
+                "component is either not trained or "
+                "didn't receive enough training data"
+            )
+            return messages
+
+        for message in messages:
+            for attribute in self._attributes:
+
+                message_tokens = self._get_processed_message_tokens_by_attribute(
+                    message, attribute
+                )
+
+                # features shape (1, seq, dim)
+                sequence_features, sentence_features = self._create_features(
+                    attribute, [message_tokens]
+                )
+                self.add_features_to_message(
+                    sequence_features[0], sentence_features[0], attribute, message
+                )
+
+        return messages
+
+    def _collect_vectorizer_vocabularies(self) -> Dict[Text, Optional[Dict[Text, int]]]:
+        """Gets vocabulary for all attributes."""
+        attribute_vocabularies = {}
+        for attribute in self._attributes:
+            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
+                attribute
+            )
+        return attribute_vocabularies
+
+    @staticmethod
+    def _is_any_model_trained(
+        attribute_vocabularies: Dict[Text, Optional[Dict[Text, int]]]
+    ) -> bool:
+        """Checks whether any model was trained."""
+        return any(value is not None for value in attribute_vocabularies.values())
+
+    def persist(self) -> None:
+        """Persists the trained vocabularies and OOV words to model storage."""
+        if not self.vectorizers:
+            return
+
+        with self._model_storage.write_to(self._resource) as model_dir:
+            # vectorizer instance was not None, some models could have been trained
+            attribute_vocabularies = self._collect_vectorizer_vocabularies()
+            if self._is_any_model_trained(attribute_vocabularies):
+                # Definitely need to persist some vocabularies
+                featurizer_file = model_dir / "vocabularies.pkl"
+
+                # Only persist vocabulary from one attribute if `use_shared_vocab`.
+                # Can be loaded and distributed to all attributes.
+                vocab = (
+                    attribute_vocabularies[TEXT]
+                    if self.use_shared_vocab
+                    else attribute_vocabularies
+                )
+
+                io_utils.json_pickle(featurizer_file, vocab)
+
+            # Dump OOV words separately as they might have been modified during
+            # training
+            rasa.shared.utils.io.dump_obj_as_json_to_file(
+                model_dir / "oov_words.json", self.OOV_words
+            )
+
+    @classmethod
+    def _create_shared_vocab_vectorizers(
+        cls, parameters: Dict[Text, Any], vocabulary: Optional[Any] = None
+    ) -> Dict[Text, CountVectorizer]:
+        """Create vectorizers for all attributes with shared vocabulary."""
+        shared_vectorizer = CountVectorizer(
+            token_pattern=r"(?u)\b\w+\b" if parameters["analyzer"] == "word" else None,
+            strip_accents=parameters["strip_accents"],
+            lowercase=parameters["lowercase"],
+            stop_words=parameters["stop_words"],
+            ngram_range=(parameters["min_ngram"], parameters["max_ngram"]),
+            max_df=parameters["max_df"],
+            min_df=parameters["min_df"],
+            max_features=parameters["max_features"],
+            analyzer=parameters["analyzer"],
+            vocabulary=vocabulary,
+        )
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes_for(parameters["analyzer"]):
+            attribute_vectorizers[attribute] = shared_vectorizer
+
+        return attribute_vectorizers
+
+    @classmethod
+    def _create_independent_vocab_vectorizers(
+        cls, parameters: Dict[Text, Any], vocabulary: Optional[Any] = None
+    ) -> Dict[Text, CountVectorizer]:
+        """Create vectorizers for all attributes with independent vocabulary."""
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes_for(parameters["analyzer"]):
+            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
+
+            attribute_vectorizer = CountVectorizer(
+                token_pattern=r"(?u)\b\w+\b"
+                if parameters["analyzer"] == "word"
+                else None,
+                strip_accents=parameters["strip_accents"],
+                lowercase=parameters["lowercase"],
+                stop_words=parameters["stop_words"],
+                ngram_range=(parameters["min_ngram"], parameters["max_ngram"]),
+                max_df=parameters["max_df"],
+                min_df=parameters["min_df"]
+                if attribute == rasa.shared.nlu.constants.TEXT
+                else 1,
+                max_features=parameters["max_features"],
+                analyzer=parameters["analyzer"],
+                vocabulary=attribute_vocabulary,
+            )
+            attribute_vectorizers[attribute] = attribute_vectorizer
+
+        return attribute_vectorizers
+
+    @classmethod
+    def load(
+        cls,
+        config: Dict[Text, Any],
+        model_storage: ModelStorage,
+        resource: Resource,
+        execution_context: ExecutionContext,
+        **kwargs: Any,
+    ) -> CountVectorsFeaturizer:
+        """Loads trained component (see parent class for full docstring)."""
+        try:
+            with model_storage.read_from(resource) as model_dir:
+                featurizer_file = model_dir / "vocabularies.pkl"
+                vocabulary = io_utils.json_unpickle(featurizer_file)
+
+                share_vocabulary = config["use_shared_vocab"]
+
+                if share_vocabulary:
+                    vectorizers = cls._create_shared_vocab_vectorizers(
+                        config, vocabulary=vocabulary
+                    )
+                else:
+                    vectorizers = cls._create_independent_vocab_vectorizers(
+                        config, vocabulary=vocabulary
+                    )
+
+                oov_words = rasa.shared.utils.io.read_json_file(
+                    model_dir / "oov_words.json"
+                )
+
+                ftr = cls(
+                    config,
+                    model_storage,
+                    resource,
+                    execution_context,
+                    vectorizers=vectorizers,
+                    oov_token=config["OOV_token"],
+                    oov_words=oov_words,
+                )
+
+                # make sure the vocabulary has been loaded correctly
+                for attribute in vectorizers:
+                    ftr.vectorizers[attribute]._validate_vocabulary()
+
+                return ftr
+
+        except (ValueError, FileNotFoundError, FileIOException):
+            logger.debug(
+                f"Failed to load `{cls.__name__}` from model storage. "
+                f"Resource '{resource.name}' doesn't exist."
+            )
+            return cls(
+                config=config,
+                model_storage=model_storage,
+                resource=resource,
+                execution_context=execution_context,
+            )
+
+    @classmethod
+    def validate_config(cls, config: Dict[Text, Any]) -> None:
+        """Validates that the component is configured properly."""
+        pass