eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,1689 @@
1
+ # Copyright 2023 The Google Research Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # mypy: ignore-errors
16
+
17
+ """Utility library of instructions."""
18
+
19
+ import functools
20
+ import os
21
+ import random
22
+ import re
23
+
24
+ import nltk
25
+
26
+ from eval_framework.logger import logger
27
+
28
+
29
def download_nltk_resources():
    """Ensure the NLTK 'punkt_tab' tokenizer data is available locally.

    Looks up ``tokenizers/punkt_tab`` through ``nltk.data.find``. When the
    lookup fails, the resource is downloaded, but only by the process whose
    ``LOCAL_RANK`` environment variable is "0" (or unset); any other process
    returns without downloading.
    """
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        # Only the rank-0 process performs the download.
        if os.environ.get("LOCAL_RANK", "0") != "0":
            return
        # nltk.download() reports failure through its return value rather
        # than by raising, so only claim success when it actually succeeded.
        if nltk.download("punkt_tab"):
            logger.info("Downloaded punkt_tab on rank 0")
        else:
            logger.warning("Failed to download punkt_tab on rank 0")
37
+
38
+
39
+ download_nltk_resources()  # Module-level call: executes when this module is imported.
40
+
41
+ WORD_LIST = [
42
+ "western",
43
+ "sentence",
44
+ "signal",
45
+ "dump",
46
+ "spot",
47
+ "opposite",
48
+ "bottom",
49
+ "potato",
50
+ "administration",
51
+ "working",
52
+ "welcome",
53
+ "morning",
54
+ "good",
55
+ "agency",
56
+ "primary",
57
+ "wish",
58
+ "responsibility",
59
+ "press",
60
+ "problem",
61
+ "president",
62
+ "steal",
63
+ "brush",
64
+ "read",
65
+ "type",
66
+ "beat",
67
+ "trainer",
68
+ "growth",
69
+ "lock",
70
+ "bone",
71
+ "case",
72
+ "equal",
73
+ "comfortable",
74
+ "region",
75
+ "replacement",
76
+ "performance",
77
+ "mate",
78
+ "walk",
79
+ "medicine",
80
+ "film",
81
+ "thing",
82
+ "rock",
83
+ "tap",
84
+ "total",
85
+ "competition",
86
+ "ease",
87
+ "south",
88
+ "establishment",
89
+ "gather",
90
+ "parking",
91
+ "world",
92
+ "plenty",
93
+ "breath",
94
+ "claim",
95
+ "alcohol",
96
+ "trade",
97
+ "dear",
98
+ "highlight",
99
+ "street",
100
+ "matter",
101
+ "decision",
102
+ "mess",
103
+ "agreement",
104
+ "studio",
105
+ "coach",
106
+ "assist",
107
+ "brain",
108
+ "wing",
109
+ "style",
110
+ "private",
111
+ "top",
112
+ "brown",
113
+ "leg",
114
+ "buy",
115
+ "procedure",
116
+ "method",
117
+ "speed",
118
+ "high",
119
+ "company",
120
+ "valuable",
121
+ "pie",
122
+ "analyst",
123
+ "session",
124
+ "pattern",
125
+ "district",
126
+ "pleasure",
127
+ "dinner",
128
+ "swimming",
129
+ "joke",
130
+ "order",
131
+ "plate",
132
+ "department",
133
+ "motor",
134
+ "cell",
135
+ "spend",
136
+ "cabinet",
137
+ "difference",
138
+ "power",
139
+ "examination",
140
+ "engine",
141
+ "horse",
142
+ "dimension",
143
+ "pay",
144
+ "toe",
145
+ "curve",
146
+ "literature",
147
+ "bother",
148
+ "fire",
149
+ "possibility",
150
+ "debate",
151
+ "activity",
152
+ "passage",
153
+ "hello",
154
+ "cycle",
155
+ "background",
156
+ "quiet",
157
+ "author",
158
+ "effect",
159
+ "actor",
160
+ "page",
161
+ "bicycle",
162
+ "error",
163
+ "throat",
164
+ "attack",
165
+ "character",
166
+ "phone",
167
+ "tea",
168
+ "increase",
169
+ "outcome",
170
+ "file",
171
+ "specific",
172
+ "inspector",
173
+ "internal",
174
+ "potential",
175
+ "staff",
176
+ "building",
177
+ "employer",
178
+ "shoe",
179
+ "hand",
180
+ "direction",
181
+ "garden",
182
+ "purchase",
183
+ "interview",
184
+ "study",
185
+ "recognition",
186
+ "member",
187
+ "spiritual",
188
+ "oven",
189
+ "sandwich",
190
+ "weird",
191
+ "passenger",
192
+ "particular",
193
+ "response",
194
+ "reaction",
195
+ "size",
196
+ "variation",
197
+ "a",
198
+ "cancel",
199
+ "candy",
200
+ "exit",
201
+ "guest",
202
+ "condition",
203
+ "fly",
204
+ "price",
205
+ "weakness",
206
+ "convert",
207
+ "hotel",
208
+ "great",
209
+ "mouth",
210
+ "mind",
211
+ "song",
212
+ "sugar",
213
+ "suspect",
214
+ "telephone",
215
+ "ear",
216
+ "roof",
217
+ "paint",
218
+ "refrigerator",
219
+ "organization",
220
+ "jury",
221
+ "reward",
222
+ "engineering",
223
+ "day",
224
+ "possession",
225
+ "crew",
226
+ "bar",
227
+ "road",
228
+ "description",
229
+ "celebration",
230
+ "score",
231
+ "mark",
232
+ "letter",
233
+ "shower",
234
+ "suggestion",
235
+ "sir",
236
+ "luck",
237
+ "national",
238
+ "progress",
239
+ "hall",
240
+ "stroke",
241
+ "theory",
242
+ "offer",
243
+ "story",
244
+ "tax",
245
+ "definition",
246
+ "history",
247
+ "ride",
248
+ "medium",
249
+ "opening",
250
+ "glass",
251
+ "elevator",
252
+ "stomach",
253
+ "question",
254
+ "ability",
255
+ "leading",
256
+ "village",
257
+ "computer",
258
+ "city",
259
+ "grand",
260
+ "confidence",
261
+ "candle",
262
+ "priest",
263
+ "recommendation",
264
+ "point",
265
+ "necessary",
266
+ "body",
267
+ "desk",
268
+ "secret",
269
+ "horror",
270
+ "noise",
271
+ "culture",
272
+ "warning",
273
+ "water",
274
+ "round",
275
+ "diet",
276
+ "flower",
277
+ "bus",
278
+ "tough",
279
+ "permission",
280
+ "week",
281
+ "prompt",
282
+ "connection",
283
+ "abuse",
284
+ "height",
285
+ "save",
286
+ "corner",
287
+ "border",
288
+ "stress",
289
+ "drive",
290
+ "stop",
291
+ "rip",
292
+ "meal",
293
+ "listen",
294
+ "confusion",
295
+ "girlfriend",
296
+ "living",
297
+ "relation",
298
+ "significance",
299
+ "plan",
300
+ "creative",
301
+ "atmosphere",
302
+ "blame",
303
+ "invite",
304
+ "housing",
305
+ "paper",
306
+ "drink",
307
+ "roll",
308
+ "silver",
309
+ "drunk",
310
+ "age",
311
+ "damage",
312
+ "smoke",
313
+ "environment",
314
+ "pack",
315
+ "savings",
316
+ "influence",
317
+ "tourist",
318
+ "rain",
319
+ "post",
320
+ "sign",
321
+ "grandmother",
322
+ "run",
323
+ "profit",
324
+ "push",
325
+ "clerk",
326
+ "final",
327
+ "wine",
328
+ "swim",
329
+ "pause",
330
+ "stuff",
331
+ "singer",
332
+ "funeral",
333
+ "average",
334
+ "source",
335
+ "scene",
336
+ "tradition",
337
+ "personal",
338
+ "snow",
339
+ "nobody",
340
+ "distance",
341
+ "sort",
342
+ "sensitive",
343
+ "animal",
344
+ "major",
345
+ "negotiation",
346
+ "click",
347
+ "mood",
348
+ "period",
349
+ "arrival",
350
+ "expression",
351
+ "holiday",
352
+ "repeat",
353
+ "dust",
354
+ "closet",
355
+ "gold",
356
+ "bad",
357
+ "sail",
358
+ "combination",
359
+ "clothes",
360
+ "emphasis",
361
+ "duty",
362
+ "black",
363
+ "step",
364
+ "school",
365
+ "jump",
366
+ "document",
367
+ "professional",
368
+ "lip",
369
+ "chemical",
370
+ "front",
371
+ "wake",
372
+ "while",
373
+ "inside",
374
+ "watch",
375
+ "row",
376
+ "subject",
377
+ "penalty",
378
+ "balance",
379
+ "possible",
380
+ "adult",
381
+ "aside",
382
+ "sample",
383
+ "appeal",
384
+ "wedding",
385
+ "depth",
386
+ "king",
387
+ "award",
388
+ "wife",
389
+ "blow",
390
+ "site",
391
+ "camp",
392
+ "music",
393
+ "safe",
394
+ "gift",
395
+ "fault",
396
+ "guess",
397
+ "act",
398
+ "shame",
399
+ "drama",
400
+ "capital",
401
+ "exam",
402
+ "stupid",
403
+ "record",
404
+ "sound",
405
+ "swing",
406
+ "novel",
407
+ "minimum",
408
+ "ratio",
409
+ "machine",
410
+ "shape",
411
+ "lead",
412
+ "operation",
413
+ "salary",
414
+ "cloud",
415
+ "affair",
416
+ "hit",
417
+ "chapter",
418
+ "stage",
419
+ "quantity",
420
+ "access",
421
+ "army",
422
+ "chain",
423
+ "traffic",
424
+ "kick",
425
+ "analysis",
426
+ "airport",
427
+ "time",
428
+ "vacation",
429
+ "philosophy",
430
+ "ball",
431
+ "chest",
432
+ "thanks",
433
+ "place",
434
+ "mountain",
435
+ "advertising",
436
+ "red",
437
+ "past",
438
+ "rent",
439
+ "return",
440
+ "tour",
441
+ "house",
442
+ "construction",
443
+ "net",
444
+ "native",
445
+ "war",
446
+ "figure",
447
+ "fee",
448
+ "spray",
449
+ "user",
450
+ "dirt",
451
+ "shot",
452
+ "task",
453
+ "stick",
454
+ "friend",
455
+ "software",
456
+ "promotion",
457
+ "interaction",
458
+ "surround",
459
+ "block",
460
+ "purpose",
461
+ "practice",
462
+ "conflict",
463
+ "routine",
464
+ "requirement",
465
+ "bonus",
466
+ "hole",
467
+ "state",
468
+ "junior",
469
+ "sweet",
470
+ "catch",
471
+ "tear",
472
+ "fold",
473
+ "wall",
474
+ "editor",
475
+ "life",
476
+ "position",
477
+ "pound",
478
+ "respect",
479
+ "bathroom",
480
+ "coat",
481
+ "script",
482
+ "job",
483
+ "teach",
484
+ "birth",
485
+ "view",
486
+ "resolve",
487
+ "theme",
488
+ "employee",
489
+ "doubt",
490
+ "market",
491
+ "education",
492
+ "serve",
493
+ "recover",
494
+ "tone",
495
+ "harm",
496
+ "miss",
497
+ "union",
498
+ "understanding",
499
+ "cow",
500
+ "river",
501
+ "association",
502
+ "concept",
503
+ "training",
504
+ "recipe",
505
+ "relationship",
506
+ "reserve",
507
+ "depression",
508
+ "proof",
509
+ "hair",
510
+ "revenue",
511
+ "independent",
512
+ "lift",
513
+ "assignment",
514
+ "temporary",
515
+ "amount",
516
+ "loss",
517
+ "edge",
518
+ "track",
519
+ "check",
520
+ "rope",
521
+ "estimate",
522
+ "pollution",
523
+ "stable",
524
+ "message",
525
+ "delivery",
526
+ "perspective",
527
+ "mirror",
528
+ "assistant",
529
+ "representative",
530
+ "witness",
531
+ "nature",
532
+ "judge",
533
+ "fruit",
534
+ "tip",
535
+ "devil",
536
+ "town",
537
+ "emergency",
538
+ "upper",
539
+ "drop",
540
+ "stay",
541
+ "human",
542
+ "neck",
543
+ "speaker",
544
+ "network",
545
+ "sing",
546
+ "resist",
547
+ "league",
548
+ "trip",
549
+ "signature",
550
+ "lawyer",
551
+ "importance",
552
+ "gas",
553
+ "choice",
554
+ "engineer",
555
+ "success",
556
+ "part",
557
+ "external",
558
+ "worker",
559
+ "simple",
560
+ "quarter",
561
+ "student",
562
+ "heart",
563
+ "pass",
564
+ "spite",
565
+ "shift",
566
+ "rough",
567
+ "lady",
568
+ "grass",
569
+ "community",
570
+ "garage",
571
+ "youth",
572
+ "standard",
573
+ "skirt",
574
+ "promise",
575
+ "blind",
576
+ "television",
577
+ "disease",
578
+ "commission",
579
+ "positive",
580
+ "energy",
581
+ "calm",
582
+ "presence",
583
+ "tune",
584
+ "basis",
585
+ "preference",
586
+ "head",
587
+ "common",
588
+ "cut",
589
+ "somewhere",
590
+ "presentation",
591
+ "current",
592
+ "thought",
593
+ "revolution",
594
+ "effort",
595
+ "master",
596
+ "implement",
597
+ "republic",
598
+ "floor",
599
+ "principle",
600
+ "stranger",
601
+ "shoulder",
602
+ "grade",
603
+ "button",
604
+ "tennis",
605
+ "police",
606
+ "collection",
607
+ "account",
608
+ "register",
609
+ "glove",
610
+ "divide",
611
+ "professor",
612
+ "chair",
613
+ "priority",
614
+ "combine",
615
+ "peace",
616
+ "extension",
617
+ "maybe",
618
+ "evening",
619
+ "frame",
620
+ "sister",
621
+ "wave",
622
+ "code",
623
+ "application",
624
+ "mouse",
625
+ "match",
626
+ "counter",
627
+ "bottle",
628
+ "half",
629
+ "cheek",
630
+ "resolution",
631
+ "back",
632
+ "knowledge",
633
+ "make",
634
+ "discussion",
635
+ "screw",
636
+ "length",
637
+ "accident",
638
+ "battle",
639
+ "dress",
640
+ "knee",
641
+ "log",
642
+ "package",
643
+ "it",
644
+ "turn",
645
+ "hearing",
646
+ "newspaper",
647
+ "layer",
648
+ "wealth",
649
+ "profile",
650
+ "imagination",
651
+ "answer",
652
+ "weekend",
653
+ "teacher",
654
+ "appearance",
655
+ "meet",
656
+ "bike",
657
+ "rise",
658
+ "belt",
659
+ "crash",
660
+ "bowl",
661
+ "equivalent",
662
+ "support",
663
+ "image",
664
+ "poem",
665
+ "risk",
666
+ "excitement",
667
+ "remote",
668
+ "secretary",
669
+ "public",
670
+ "produce",
671
+ "plane",
672
+ "display",
673
+ "money",
674
+ "sand",
675
+ "situation",
676
+ "punch",
677
+ "customer",
678
+ "title",
679
+ "shake",
680
+ "mortgage",
681
+ "option",
682
+ "number",
683
+ "pop",
684
+ "window",
685
+ "extent",
686
+ "nothing",
687
+ "experience",
688
+ "opinion",
689
+ "departure",
690
+ "dance",
691
+ "indication",
692
+ "boy",
693
+ "material",
694
+ "band",
695
+ "leader",
696
+ "sun",
697
+ "beautiful",
698
+ "muscle",
699
+ "farmer",
700
+ "variety",
701
+ "fat",
702
+ "handle",
703
+ "director",
704
+ "opportunity",
705
+ "calendar",
706
+ "outside",
707
+ "pace",
708
+ "bath",
709
+ "fish",
710
+ "consequence",
711
+ "put",
712
+ "owner",
713
+ "go",
714
+ "doctor",
715
+ "information",
716
+ "share",
717
+ "hurt",
718
+ "protection",
719
+ "career",
720
+ "finance",
721
+ "force",
722
+ "golf",
723
+ "garbage",
724
+ "aspect",
725
+ "kid",
726
+ "food",
727
+ "boot",
728
+ "milk",
729
+ "respond",
730
+ "objective",
731
+ "reality",
732
+ "raw",
733
+ "ring",
734
+ "mall",
735
+ "one",
736
+ "impact",
737
+ "area",
738
+ "news",
739
+ "international",
740
+ "series",
741
+ "impress",
742
+ "mother",
743
+ "shelter",
744
+ "strike",
745
+ "loan",
746
+ "month",
747
+ "seat",
748
+ "anything",
749
+ "entertainment",
750
+ "familiar",
751
+ "clue",
752
+ "year",
753
+ "glad",
754
+ "supermarket",
755
+ "natural",
756
+ "god",
757
+ "cost",
758
+ "conversation",
759
+ "tie",
760
+ "ruin",
761
+ "comfort",
762
+ "earth",
763
+ "storm",
764
+ "percentage",
765
+ "assistance",
766
+ "budget",
767
+ "strength",
768
+ "beginning",
769
+ "sleep",
770
+ "other",
771
+ "young",
772
+ "unit",
773
+ "fill",
774
+ "store",
775
+ "desire",
776
+ "hide",
777
+ "value",
778
+ "cup",
779
+ "maintenance",
780
+ "nurse",
781
+ "function",
782
+ "tower",
783
+ "role",
784
+ "class",
785
+ "camera",
786
+ "database",
787
+ "panic",
788
+ "nation",
789
+ "basket",
790
+ "ice",
791
+ "art",
792
+ "spirit",
793
+ "chart",
794
+ "exchange",
795
+ "feedback",
796
+ "statement",
797
+ "reputation",
798
+ "search",
799
+ "hunt",
800
+ "exercise",
801
+ "nasty",
802
+ "notice",
803
+ "male",
804
+ "yard",
805
+ "annual",
806
+ "collar",
807
+ "date",
808
+ "platform",
809
+ "plant",
810
+ "fortune",
811
+ "passion",
812
+ "friendship",
813
+ "spread",
814
+ "cancer",
815
+ "ticket",
816
+ "attitude",
817
+ "island",
818
+ "active",
819
+ "object",
820
+ "service",
821
+ "buyer",
822
+ "bite",
823
+ "card",
824
+ "face",
825
+ "steak",
826
+ "proposal",
827
+ "patient",
828
+ "heat",
829
+ "rule",
830
+ "resident",
831
+ "broad",
832
+ "politics",
833
+ "west",
834
+ "knife",
835
+ "expert",
836
+ "girl",
837
+ "design",
838
+ "salt",
839
+ "baseball",
840
+ "grab",
841
+ "inspection",
842
+ "cousin",
843
+ "couple",
844
+ "magazine",
845
+ "cook",
846
+ "dependent",
847
+ "security",
848
+ "chicken",
849
+ "version",
850
+ "currency",
851
+ "ladder",
852
+ "scheme",
853
+ "kitchen",
854
+ "employment",
855
+ "local",
856
+ "attention",
857
+ "manager",
858
+ "fact",
859
+ "cover",
860
+ "sad",
861
+ "guard",
862
+ "relative",
863
+ "county",
864
+ "rate",
865
+ "lunch",
866
+ "program",
867
+ "initiative",
868
+ "gear",
869
+ "bridge",
870
+ "breast",
871
+ "talk",
872
+ "dish",
873
+ "guarantee",
874
+ "beer",
875
+ "vehicle",
876
+ "reception",
877
+ "woman",
878
+ "substance",
879
+ "copy",
880
+ "lecture",
881
+ "advantage",
882
+ "park",
883
+ "cold",
884
+ "death",
885
+ "mix",
886
+ "hold",
887
+ "scale",
888
+ "tomorrow",
889
+ "blood",
890
+ "request",
891
+ "green",
892
+ "cookie",
893
+ "church",
894
+ "strip",
895
+ "forever",
896
+ "beyond",
897
+ "debt",
898
+ "tackle",
899
+ "wash",
900
+ "following",
901
+ "feel",
902
+ "maximum",
903
+ "sector",
904
+ "sea",
905
+ "property",
906
+ "economics",
907
+ "menu",
908
+ "bench",
909
+ "try",
910
+ "language",
911
+ "start",
912
+ "call",
913
+ "solid",
914
+ "address",
915
+ "income",
916
+ "foot",
917
+ "senior",
918
+ "honey",
919
+ "few",
920
+ "mixture",
921
+ "cash",
922
+ "grocery",
923
+ "link",
924
+ "map",
925
+ "form",
926
+ "factor",
927
+ "pot",
928
+ "model",
929
+ "writer",
930
+ "farm",
931
+ "winter",
932
+ "skill",
933
+ "anywhere",
934
+ "birthday",
935
+ "policy",
936
+ "release",
937
+ "husband",
938
+ "lab",
939
+ "hurry",
940
+ "mail",
941
+ "equipment",
942
+ "sink",
943
+ "pair",
944
+ "driver",
945
+ "consideration",
946
+ "leather",
947
+ "skin",
948
+ "blue",
949
+ "boat",
950
+ "sale",
951
+ "brick",
952
+ "two",
953
+ "feed",
954
+ "square",
955
+ "dot",
956
+ "rush",
957
+ "dream",
958
+ "location",
959
+ "afternoon",
960
+ "manufacturer",
961
+ "control",
962
+ "occasion",
963
+ "trouble",
964
+ "introduction",
965
+ "advice",
966
+ "bet",
967
+ "eat",
968
+ "kill",
969
+ "category",
970
+ "manner",
971
+ "office",
972
+ "estate",
973
+ "pride",
974
+ "awareness",
975
+ "slip",
976
+ "crack",
977
+ "client",
978
+ "nail",
979
+ "shoot",
980
+ "membership",
981
+ "soft",
982
+ "anybody",
983
+ "web",
984
+ "official",
985
+ "individual",
986
+ "pizza",
987
+ "interest",
988
+ "bag",
989
+ "spell",
990
+ "profession",
991
+ "queen",
992
+ "deal",
993
+ "resource",
994
+ "ship",
995
+ "guy",
996
+ "chocolate",
997
+ "joint",
998
+ "formal",
999
+ "upstairs",
1000
+ "car",
1001
+ "resort",
1002
+ "abroad",
1003
+ "dealer",
1004
+ "associate",
1005
+ "finger",
1006
+ "surgery",
1007
+ "comment",
1008
+ "team",
1009
+ "detail",
1010
+ "crazy",
1011
+ "path",
1012
+ "tale",
1013
+ "initial",
1014
+ "arm",
1015
+ "radio",
1016
+ "demand",
1017
+ "single",
1018
+ "draw",
1019
+ "yellow",
1020
+ "contest",
1021
+ "piece",
1022
+ "quote",
1023
+ "pull",
1024
+ "commercial",
1025
+ "shirt",
1026
+ "contribution",
1027
+ "cream",
1028
+ "channel",
1029
+ "suit",
1030
+ "discipline",
1031
+ "instruction",
1032
+ "concert",
1033
+ "speech",
1034
+ "low",
1035
+ "effective",
1036
+ "hang",
1037
+ "scratch",
1038
+ "industry",
1039
+ "breakfast",
1040
+ "lay",
1041
+ "join",
1042
+ "metal",
1043
+ "bedroom",
1044
+ "minute",
1045
+ "product",
1046
+ "rest",
1047
+ "temperature",
1048
+ "many",
1049
+ "give",
1050
+ "argument",
1051
+ "print",
1052
+ "purple",
1053
+ "laugh",
1054
+ "health",
1055
+ "credit",
1056
+ "investment",
1057
+ "sell",
1058
+ "setting",
1059
+ "lesson",
1060
+ "egg",
1061
+ "middle",
1062
+ "marriage",
1063
+ "level",
1064
+ "evidence",
1065
+ "phrase",
1066
+ "love",
1067
+ "self",
1068
+ "benefit",
1069
+ "guidance",
1070
+ "affect",
1071
+ "you",
1072
+ "dad",
1073
+ "anxiety",
1074
+ "special",
1075
+ "boyfriend",
1076
+ "test",
1077
+ "blank",
1078
+ "payment",
1079
+ "soup",
1080
+ "obligation",
1081
+ "reply",
1082
+ "smile",
1083
+ "deep",
1084
+ "complaint",
1085
+ "addition",
1086
+ "review",
1087
+ "box",
1088
+ "towel",
1089
+ "minor",
1090
+ "fun",
1091
+ "soil",
1092
+ "issue",
1093
+ "cigarette",
1094
+ "internet",
1095
+ "gain",
1096
+ "tell",
1097
+ "entry",
1098
+ "spare",
1099
+ "incident",
1100
+ "family",
1101
+ "refuse",
1102
+ "branch",
1103
+ "can",
1104
+ "pen",
1105
+ "grandfather",
1106
+ "constant",
1107
+ "tank",
1108
+ "uncle",
1109
+ "climate",
1110
+ "ground",
1111
+ "volume",
1112
+ "communication",
1113
+ "kind",
1114
+ "poet",
1115
+ "child",
1116
+ "screen",
1117
+ "mine",
1118
+ "quit",
1119
+ "gene",
1120
+ "lack",
1121
+ "charity",
1122
+ "memory",
1123
+ "tooth",
1124
+ "fear",
1125
+ "mention",
1126
+ "marketing",
1127
+ "reveal",
1128
+ "reason",
1129
+ "court",
1130
+ "season",
1131
+ "freedom",
1132
+ "land",
1133
+ "sport",
1134
+ "audience",
1135
+ "classroom",
1136
+ "law",
1137
+ "hook",
1138
+ "win",
1139
+ "carry",
1140
+ "eye",
1141
+ "smell",
1142
+ "distribution",
1143
+ "research",
1144
+ "country",
1145
+ "dare",
1146
+ "hope",
1147
+ "whereas",
1148
+ "stretch",
1149
+ "library",
1150
+ "if",
1151
+ "delay",
1152
+ "college",
1153
+ "plastic",
1154
+ "book",
1155
+ "present",
1156
+ "use",
1157
+ "worry",
1158
+ "champion",
1159
+ "goal",
1160
+ "economy",
1161
+ "march",
1162
+ "election",
1163
+ "reflection",
1164
+ "midnight",
1165
+ "slide",
1166
+ "inflation",
1167
+ "action",
1168
+ "challenge",
1169
+ "guitar",
1170
+ "coast",
1171
+ "apple",
1172
+ "campaign",
1173
+ "field",
1174
+ "jacket",
1175
+ "sense",
1176
+ "way",
1177
+ "visual",
1178
+ "remove",
1179
+ "weather",
1180
+ "trash",
1181
+ "cable",
1182
+ "regret",
1183
+ "buddy",
1184
+ "beach",
1185
+ "historian",
1186
+ "courage",
1187
+ "sympathy",
1188
+ "truck",
1189
+ "tension",
1190
+ "permit",
1191
+ "nose",
1192
+ "bed",
1193
+ "son",
1194
+ "person",
1195
+ "base",
1196
+ "meat",
1197
+ "usual",
1198
+ "air",
1199
+ "meeting",
1200
+ "worth",
1201
+ "game",
1202
+ "independence",
1203
+ "physical",
1204
+ "brief",
1205
+ "play",
1206
+ "raise",
1207
+ "board",
1208
+ "she",
1209
+ "key",
1210
+ "writing",
1211
+ "pick",
1212
+ "command",
1213
+ "party",
1214
+ "yesterday",
1215
+ "spring",
1216
+ "candidate",
1217
+ "physics",
1218
+ "university",
1219
+ "concern",
1220
+ "development",
1221
+ "change",
1222
+ "string",
1223
+ "target",
1224
+ "instance",
1225
+ "room",
1226
+ "bitter",
1227
+ "bird",
1228
+ "football",
1229
+ "normal",
1230
+ "split",
1231
+ "impression",
1232
+ "wood",
1233
+ "long",
1234
+ "meaning",
1235
+ "stock",
1236
+ "cap",
1237
+ "leadership",
1238
+ "media",
1239
+ "ambition",
1240
+ "fishing",
1241
+ "essay",
1242
+ "salad",
1243
+ "repair",
1244
+ "today",
1245
+ "designer",
1246
+ "night",
1247
+ "bank",
1248
+ "drawing",
1249
+ "inevitable",
1250
+ "phase",
1251
+ "vast",
1252
+ "chip",
1253
+ "anger",
1254
+ "switch",
1255
+ "cry",
1256
+ "twist",
1257
+ "personality",
1258
+ "attempt",
1259
+ "storage",
1260
+ "being",
1261
+ "preparation",
1262
+ "bat",
1263
+ "selection",
1264
+ "white",
1265
+ "technology",
1266
+ "contract",
1267
+ "side",
1268
+ "section",
1269
+ "station",
1270
+ "till",
1271
+ "structure",
1272
+ "tongue",
1273
+ "taste",
1274
+ "truth",
1275
+ "difficulty",
1276
+ "group",
1277
+ "limit",
1278
+ "main",
1279
+ "move",
1280
+ "feeling",
1281
+ "light",
1282
+ "example",
1283
+ "mission",
1284
+ "might",
1285
+ "wait",
1286
+ "wheel",
1287
+ "shop",
1288
+ "host",
1289
+ "classic",
1290
+ "alternative",
1291
+ "cause",
1292
+ "agent",
1293
+ "consist",
1294
+ "table",
1295
+ "airline",
1296
+ "text",
1297
+ "pool",
1298
+ "craft",
1299
+ "range",
1300
+ "fuel",
1301
+ "tool",
1302
+ "partner",
1303
+ "load",
1304
+ "entrance",
1305
+ "deposit",
1306
+ "hate",
1307
+ "article",
1308
+ "video",
1309
+ "summer",
1310
+ "feature",
1311
+ "extreme",
1312
+ "mobile",
1313
+ "hospital",
1314
+ "flight",
1315
+ "fall",
1316
+ "pension",
1317
+ "piano",
1318
+ "fail",
1319
+ "result",
1320
+ "rub",
1321
+ "gap",
1322
+ "system",
1323
+ "report",
1324
+ "suck",
1325
+ "ordinary",
1326
+ "wind",
1327
+ "nerve",
1328
+ "ask",
1329
+ "shine",
1330
+ "note",
1331
+ "line",
1332
+ "mom",
1333
+ "perception",
1334
+ "brother",
1335
+ "reference",
1336
+ "bend",
1337
+ "charge",
1338
+ "treat",
1339
+ "trick",
1340
+ "term",
1341
+ "homework",
1342
+ "bake",
1343
+ "bid",
1344
+ "status",
1345
+ "project",
1346
+ "strategy",
1347
+ "orange",
1348
+ "let",
1349
+ "enthusiasm",
1350
+ "parent",
1351
+ "concentrate",
1352
+ "device",
1353
+ "travel",
1354
+ "poetry",
1355
+ "business",
1356
+ "society",
1357
+ "kiss",
1358
+ "end",
1359
+ "vegetable",
1360
+ "employ",
1361
+ "schedule",
1362
+ "hour",
1363
+ "brave",
1364
+ "focus",
1365
+ "process",
1366
+ "movie",
1367
+ "illegal",
1368
+ "general",
1369
+ "coffee",
1370
+ "ad",
1371
+ "highway",
1372
+ "chemistry",
1373
+ "psychology",
1374
+ "hire",
1375
+ "bell",
1376
+ "conference",
1377
+ "relief",
1378
+ "show",
1379
+ "neat",
1380
+ "funny",
1381
+ "weight",
1382
+ "quality",
1383
+ "club",
1384
+ "daughter",
1385
+ "zone",
1386
+ "touch",
1387
+ "tonight",
1388
+ "shock",
1389
+ "burn",
1390
+ "excuse",
1391
+ "name",
1392
+ "survey",
1393
+ "landscape",
1394
+ "advance",
1395
+ "satisfaction",
1396
+ "bread",
1397
+ "disaster",
1398
+ "item",
1399
+ "hat",
1400
+ "prior",
1401
+ "shopping",
1402
+ "visit",
1403
+ "east",
1404
+ "photo",
1405
+ "home",
1406
+ "idea",
1407
+ "father",
1408
+ "comparison",
1409
+ "cat",
1410
+ "pipe",
1411
+ "winner",
1412
+ "count",
1413
+ "lake",
1414
+ "fight",
1415
+ "prize",
1416
+ "foundation",
1417
+ "dog",
1418
+ "keep",
1419
+ "ideal",
1420
+ "fan",
1421
+ "struggle",
1422
+ "peak",
1423
+ "safety",
1424
+ "solution",
1425
+ "hell",
1426
+ "conclusion",
1427
+ "population",
1428
+ "strain",
1429
+ "alarm",
1430
+ "measurement",
1431
+ "second",
1432
+ "train",
1433
+ "race",
1434
+ "due",
1435
+ "insurance",
1436
+ "boss",
1437
+ "tree",
1438
+ "monitor",
1439
+ "sick",
1440
+ "course",
1441
+ "drag",
1442
+ "appointment",
1443
+ "slice",
1444
+ "still",
1445
+ "care",
1446
+ "patience",
1447
+ "rich",
1448
+ "escape",
1449
+ "emotion",
1450
+ "royal",
1451
+ "female",
1452
+ "childhood",
1453
+ "government",
1454
+ "picture",
1455
+ "will",
1456
+ "sock",
1457
+ "big",
1458
+ "gate",
1459
+ "oil",
1460
+ "cross",
1461
+ "pin",
1462
+ "improvement",
1463
+ "championship",
1464
+ "silly",
1465
+ "help",
1466
+ "sky",
1467
+ "pitch",
1468
+ "man",
1469
+ "diamond",
1470
+ "most",
1471
+ "transition",
1472
+ "work",
1473
+ "science",
1474
+ "committee",
1475
+ "moment",
1476
+ "fix",
1477
+ "teaching",
1478
+ "dig",
1479
+ "specialist",
1480
+ "complex",
1481
+ "guide",
1482
+ "people",
1483
+ "dead",
1484
+ "voice",
1485
+ "original",
1486
+ "break",
1487
+ "topic",
1488
+ "data",
1489
+ "degree",
1490
+ "reading",
1491
+ "recording",
1492
+ "bunch",
1493
+ "reach",
1494
+ "judgment",
1495
+ "lie",
1496
+ "regular",
1497
+ "set",
1498
+ "painting",
1499
+ "mode",
1500
+ "list",
1501
+ "player",
1502
+ "bear",
1503
+ "north",
1504
+ "wonder",
1505
+ "carpet",
1506
+ "heavy",
1507
+ "officer",
1508
+ "negative",
1509
+ "clock",
1510
+ "unique",
1511
+ "baby",
1512
+ "pain",
1513
+ "assumption",
1514
+ "disk",
1515
+ "iron",
1516
+ "bill",
1517
+ "drawer",
1518
+ "look",
1519
+ "double",
1520
+ "mistake",
1521
+ "finish",
1522
+ "future",
1523
+ "brilliant",
1524
+ "contact",
1525
+ "math",
1526
+ "rice",
1527
+ "leave",
1528
+ "restaurant",
1529
+ "discount",
1530
+ "sex",
1531
+ "virus",
1532
+ "bit",
1533
+ "trust",
1534
+ "event",
1535
+ "wear",
1536
+ "juice",
1537
+ "failure",
1538
+ "bug",
1539
+ "context",
1540
+ "mud",
1541
+ "whole",
1542
+ "wrap",
1543
+ "intention",
1544
+ "draft",
1545
+ "pressure",
1546
+ "cake",
1547
+ "dark",
1548
+ "explanation",
1549
+ "space",
1550
+ "angle",
1551
+ "word",
1552
+ "efficiency",
1553
+ "management",
1554
+ "habit",
1555
+ "star",
1556
+ "chance",
1557
+ "finding",
1558
+ "transportation",
1559
+ "stand",
1560
+ "criticism",
1561
+ "flow",
1562
+ "door",
1563
+ "injury",
1564
+ "insect",
1565
+ "surprise",
1566
+ "apartment",
1567
+ ] # pylint: disable=line-too-long
1568
+
1569
# Mapping from ISO 639-1 two-letter language codes to English language names.
# Insertion order is kept exactly as originally declared.
LANGUAGE_CODES = {
    "en": "English",
    "es": "Spanish",
    "pt": "Portuguese",
    "ar": "Arabic",
    "hi": "Hindi",
    "fr": "French",
    "ru": "Russian",
    "de": "German",
    "ja": "Japanese",
    "it": "Italian",
    "bn": "Bengali",
    "uk": "Ukrainian",
    "th": "Thai",
    "ur": "Urdu",
    "ta": "Tamil",
    "te": "Telugu",
    "bg": "Bulgarian",
    "ko": "Korean",
    "pl": "Polish",
    "he": "Hebrew",
    "fa": "Persian",
    "vi": "Vietnamese",
    "ne": "Nepali",
    "sw": "Swahili",
    "kn": "Kannada",
    "mr": "Marathi",
    "gu": "Gujarati",
    "pa": "Punjabi",
    "ml": "Malayalam",
    "fi": "Finnish",
    "sv": "Swedish",
}
1603
+
1604
# Regex fragments consumed by split_into_sentences(). They are kept as plain
# strings (not compiled patterns) because the splitter concatenates them into
# larger expressions.

# A single ASCII letter (captured).
_ALPHABETS = r"([A-Za-z])"
# Honorifics followed by a period, e.g. "Dr." — the title itself is captured.
_PREFIXES = r"(Mr|St|Mrs|Ms|Dr)[.]"
# Company / generational suffixes (captured, without the trailing period).
_SUFFIXES = r"(Inc|Ltd|Jr|Sr|Co)"
# Words that commonly open a new sentence; several alternatives require a
# trailing whitespace character.
_STARTERS = (
    r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s"
    r"|We\s|But\s|However\s|That\s|This\s|Wherever)"
)
# Two- or three-letter dotted upper-case acronyms such as "U.S." or "U.S.A.".
_ACRONYMS = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# A period followed by a common top-level domain (the domain is captured).
_WEBSITES = r"[.](com|net|org|io|gov|edu|me)"
# A single decimal digit (captured).
_DIGITS = r"([0-9])"
# A run of two or more consecutive periods (ellipsis-like).
_MULTIPLE_DOTS = r"\.{2,}"
1614
+
1615
+
1616
def split_into_sentences(text):
    """Split ``text`` into sentences with regex heuristics.

    Periods that do not end a sentence (titles, website suffixes, decimals,
    runs of dots, initials, acronyms, company suffixes) are temporarily
    rewritten to a ``<prd>`` marker. Every remaining ``.``, ``?`` and ``!``
    is then followed by a ``<stop>`` marker, the ``<prd>`` markers are turned
    back into periods, and the text is split on ``<stop>``.

    Args:
      text: A string containing one or more sentences.

    Returns:
      A list of whitespace-stripped sentence strings. A trailing empty entry
      (produced when the text ends with a terminator) is dropped.
    """
    # Pad with spaces so the space-anchored patterns below can also match at
    # the edges of the input; newlines are treated as ordinary spaces.
    work = (" " + text + " ").replace("\n", " ")

    # Order matters throughout: later rules rely on markers left by earlier ones.
    shield_rules = (
        (_PREFIXES, "\\1<prd>"),
        (_WEBSITES, "<prd>\\1"),
        (_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2"),
        # A run of dots keeps all of its periods and is followed by a stop.
        (_MULTIPLE_DOTS, lambda dots: "<prd>" * len(dots.group(0)) + "<stop>"),
    )
    for pattern, replacement in shield_rules:
        work = re.sub(pattern, replacement, work)

    # str.replace is a no-op when the substring is absent, so no guard is needed.
    work = work.replace("Ph.D.", "Ph<prd>D<prd>")

    initial_rules = (
        (r"\s" + _ALPHABETS + "[.] ", " \\1<prd> "),
        (_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2"),
        (
            _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>",
        ),
        (_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>"),
        (" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2"),
        (" " + _SUFFIXES + "[.]", " \\1<prd>"),
        (" " + _ALPHABETS + "[.]", " \\1<prd>"),
    )
    for pattern, replacement in initial_rules:
        work = re.sub(pattern, replacement, work)

    literal_swaps = (
        # Move terminators outside closing quotes so the quote stays attached
        # to the sentence it closes.
        (".”", "”."),
        ('."', '".'),
        ('!"', '"!'),
        ('?"', '"?'),
        # Mark every remaining terminator as a sentence boundary.
        (".", ".<stop>"),
        ("?", "?<stop>"),
        ("!", "!<stop>"),
        # Restore the shielded periods.
        ("<prd>", "."),
    )
    for old, new in literal_swaps:
        work = work.replace(old, new)

    pieces = [piece.strip() for piece in work.split("<stop>")]
    if pieces and not pieces[-1]:
        pieces.pop()
    return pieces
1665
+
1666
+
1667
def count_words(text):
    """Return the number of ``\\w+`` tokens found in ``text``.

    Args:
      text: The string to tokenize.

    Returns:
      The count of maximal runs of word characters.
    """
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
    return len(word_tokenizer.tokenize(text))
1673
+
1674
+
1675
@functools.cache
def _get_sentence_tokenizer():
    """Load the English Punkt sentence tokenizer from the NLTK data store.

    The result is memoized by ``functools.cache``, so the resource is loaded
    at most once per process.

    NOTE(review): this loads a pickled resource; newer NLTK releases appear to
    distribute Punkt models in a different format — confirm the resource is
    available for the NLTK version this project pins.
    """
    resource_url = "nltk:tokenizers/punkt/english.pickle"
    return nltk.data.load(resource_url)
1678
+
1679
+
1680
def count_sentences(text):
    """Return how many sentences the shared sentence tokenizer finds in ``text``.

    Args:
      text: The string to segment.

    Returns:
      The number of sentences produced by the tokenizer.
    """
    return len(_get_sentence_tokenizer().tokenize(text))
1685
+
1686
+
1687
def generate_keywords(num_keywords):
    """Draw ``num_keywords`` random entries from ``WORD_LIST``.

    Entries are sampled without replacement via ``random.sample``.

    Args:
      num_keywords: How many keywords to draw.

    Returns:
      A new list containing the sampled keywords.

    Raises:
      ValueError: Propagated from ``random.sample`` when ``num_keywords`` is
        negative or larger than ``len(WORD_LIST)``.
    """
    chosen = random.sample(WORD_LIST, k=num_keywords)
    return chosen