evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  11. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  12. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  13. evalscope/benchmarks/ifeval/__init__.py +0 -0
  14. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  15. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  16. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  17. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  18. evalscope/benchmarks/ifeval/utils.py +134 -0
  19. evalscope/benchmarks/iquiz/__init__.py +0 -0
  20. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  23. evalscope/benchmarks/race/race_adapter.py +4 -73
  24. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  25. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  26. evalscope/cli/cli.py +2 -0
  27. evalscope/cli/start_app.py +29 -0
  28. evalscope/collections/evaluator.py +82 -62
  29. evalscope/collections/sampler.py +47 -41
  30. evalscope/collections/schema.py +14 -10
  31. evalscope/constants.py +4 -0
  32. evalscope/evaluator/evaluator.py +22 -13
  33. evalscope/metrics/__init__.py +2 -5
  34. evalscope/metrics/metrics.py +11 -2
  35. evalscope/metrics/named_metrics.py +17 -0
  36. evalscope/models/server_adapter.py +11 -4
  37. evalscope/perf/__init__.py +1 -0
  38. evalscope/perf/main.py +0 -1
  39. evalscope/perf/plugin/api/custom_api.py +1 -1
  40. evalscope/perf/plugin/api/openai_api.py +1 -1
  41. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  42. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  43. evalscope/report/__init__.py +5 -0
  44. evalscope/report/app.py +506 -0
  45. evalscope/report/combinator.py +73 -0
  46. evalscope/report/generator.py +80 -0
  47. evalscope/report/utils.py +133 -0
  48. evalscope/run.py +16 -11
  49. evalscope/summarizer.py +1 -1
  50. evalscope/utils/chat_service.py +1 -1
  51. evalscope/utils/logger.py +1 -0
  52. evalscope/utils/model_utils.py +5 -2
  53. evalscope/version.py +2 -2
  54. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
  55. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
  56. tests/cli/test_collection.py +11 -7
  57. tests/cli/test_run.py +13 -4
  58. evalscope/tools/__init__.py +0 -1
  59. evalscope/tools/combine_reports.py +0 -133
  60. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  61. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  62. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  63. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  64. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  65. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1670 @@
1
+ # Copyright 2023 The Google Research Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Utility library of instructions."""
15
+
16
+ import functools
17
+ import immutabledict
18
+ import nltk
19
+ import os
20
+ import random
21
+ import re
22
+
23
# Value of the LOCAL_RANK environment variable, read once at import time.
# It is kept as a string and falls back to '0' when the variable is unset.
RANK = os.getenv('LOCAL_RANK', '0')
24
+
25
+ WORD_LIST = [
26
+ 'western',
27
+ 'sentence',
28
+ 'signal',
29
+ 'dump',
30
+ 'spot',
31
+ 'opposite',
32
+ 'bottom',
33
+ 'potato',
34
+ 'administration',
35
+ 'working',
36
+ 'welcome',
37
+ 'morning',
38
+ 'good',
39
+ 'agency',
40
+ 'primary',
41
+ 'wish',
42
+ 'responsibility',
43
+ 'press',
44
+ 'problem',
45
+ 'president',
46
+ 'steal',
47
+ 'brush',
48
+ 'read',
49
+ 'type',
50
+ 'beat',
51
+ 'trainer',
52
+ 'growth',
53
+ 'lock',
54
+ 'bone',
55
+ 'case',
56
+ 'equal',
57
+ 'comfortable',
58
+ 'region',
59
+ 'replacement',
60
+ 'performance',
61
+ 'mate',
62
+ 'walk',
63
+ 'medicine',
64
+ 'film',
65
+ 'thing',
66
+ 'rock',
67
+ 'tap',
68
+ 'total',
69
+ 'competition',
70
+ 'ease',
71
+ 'south',
72
+ 'establishment',
73
+ 'gather',
74
+ 'parking',
75
+ 'world',
76
+ 'plenty',
77
+ 'breath',
78
+ 'claim',
79
+ 'alcohol',
80
+ 'trade',
81
+ 'dear',
82
+ 'highlight',
83
+ 'street',
84
+ 'matter',
85
+ 'decision',
86
+ 'mess',
87
+ 'agreement',
88
+ 'studio',
89
+ 'coach',
90
+ 'assist',
91
+ 'brain',
92
+ 'wing',
93
+ 'style',
94
+ 'private',
95
+ 'top',
96
+ 'brown',
97
+ 'leg',
98
+ 'buy',
99
+ 'procedure',
100
+ 'method',
101
+ 'speed',
102
+ 'high',
103
+ 'company',
104
+ 'valuable',
105
+ 'pie',
106
+ 'analyst',
107
+ 'session',
108
+ 'pattern',
109
+ 'district',
110
+ 'pleasure',
111
+ 'dinner',
112
+ 'swimming',
113
+ 'joke',
114
+ 'order',
115
+ 'plate',
116
+ 'department',
117
+ 'motor',
118
+ 'cell',
119
+ 'spend',
120
+ 'cabinet',
121
+ 'difference',
122
+ 'power',
123
+ 'examination',
124
+ 'engine',
125
+ 'horse',
126
+ 'dimension',
127
+ 'pay',
128
+ 'toe',
129
+ 'curve',
130
+ 'literature',
131
+ 'bother',
132
+ 'fire',
133
+ 'possibility',
134
+ 'debate',
135
+ 'activity',
136
+ 'passage',
137
+ 'hello',
138
+ 'cycle',
139
+ 'background',
140
+ 'quiet',
141
+ 'author',
142
+ 'effect',
143
+ 'actor',
144
+ 'page',
145
+ 'bicycle',
146
+ 'error',
147
+ 'throat',
148
+ 'attack',
149
+ 'character',
150
+ 'phone',
151
+ 'tea',
152
+ 'increase',
153
+ 'outcome',
154
+ 'file',
155
+ 'specific',
156
+ 'inspector',
157
+ 'internal',
158
+ 'potential',
159
+ 'staff',
160
+ 'building',
161
+ 'employer',
162
+ 'shoe',
163
+ 'hand',
164
+ 'direction',
165
+ 'garden',
166
+ 'purchase',
167
+ 'interview',
168
+ 'study',
169
+ 'recognition',
170
+ 'member',
171
+ 'spiritual',
172
+ 'oven',
173
+ 'sandwich',
174
+ 'weird',
175
+ 'passenger',
176
+ 'particular',
177
+ 'response',
178
+ 'reaction',
179
+ 'size',
180
+ 'variation',
181
+ 'a',
182
+ 'cancel',
183
+ 'candy',
184
+ 'exit',
185
+ 'guest',
186
+ 'condition',
187
+ 'fly',
188
+ 'price',
189
+ 'weakness',
190
+ 'convert',
191
+ 'hotel',
192
+ 'great',
193
+ 'mouth',
194
+ 'mind',
195
+ 'song',
196
+ 'sugar',
197
+ 'suspect',
198
+ 'telephone',
199
+ 'ear',
200
+ 'roof',
201
+ 'paint',
202
+ 'refrigerator',
203
+ 'organization',
204
+ 'jury',
205
+ 'reward',
206
+ 'engineering',
207
+ 'day',
208
+ 'possession',
209
+ 'crew',
210
+ 'bar',
211
+ 'road',
212
+ 'description',
213
+ 'celebration',
214
+ 'score',
215
+ 'mark',
216
+ 'letter',
217
+ 'shower',
218
+ 'suggestion',
219
+ 'sir',
220
+ 'luck',
221
+ 'national',
222
+ 'progress',
223
+ 'hall',
224
+ 'stroke',
225
+ 'theory',
226
+ 'offer',
227
+ 'story',
228
+ 'tax',
229
+ 'definition',
230
+ 'history',
231
+ 'ride',
232
+ 'medium',
233
+ 'opening',
234
+ 'glass',
235
+ 'elevator',
236
+ 'stomach',
237
+ 'question',
238
+ 'ability',
239
+ 'leading',
240
+ 'village',
241
+ 'computer',
242
+ 'city',
243
+ 'grand',
244
+ 'confidence',
245
+ 'candle',
246
+ 'priest',
247
+ 'recommendation',
248
+ 'point',
249
+ 'necessary',
250
+ 'body',
251
+ 'desk',
252
+ 'secret',
253
+ 'horror',
254
+ 'noise',
255
+ 'culture',
256
+ 'warning',
257
+ 'water',
258
+ 'round',
259
+ 'diet',
260
+ 'flower',
261
+ 'bus',
262
+ 'tough',
263
+ 'permission',
264
+ 'week',
265
+ 'prompt',
266
+ 'connection',
267
+ 'abuse',
268
+ 'height',
269
+ 'save',
270
+ 'corner',
271
+ 'border',
272
+ 'stress',
273
+ 'drive',
274
+ 'stop',
275
+ 'rip',
276
+ 'meal',
277
+ 'listen',
278
+ 'confusion',
279
+ 'girlfriend',
280
+ 'living',
281
+ 'relation',
282
+ 'significance',
283
+ 'plan',
284
+ 'creative',
285
+ 'atmosphere',
286
+ 'blame',
287
+ 'invite',
288
+ 'housing',
289
+ 'paper',
290
+ 'drink',
291
+ 'roll',
292
+ 'silver',
293
+ 'drunk',
294
+ 'age',
295
+ 'damage',
296
+ 'smoke',
297
+ 'environment',
298
+ 'pack',
299
+ 'savings',
300
+ 'influence',
301
+ 'tourist',
302
+ 'rain',
303
+ 'post',
304
+ 'sign',
305
+ 'grandmother',
306
+ 'run',
307
+ 'profit',
308
+ 'push',
309
+ 'clerk',
310
+ 'final',
311
+ 'wine',
312
+ 'swim',
313
+ 'pause',
314
+ 'stuff',
315
+ 'singer',
316
+ 'funeral',
317
+ 'average',
318
+ 'source',
319
+ 'scene',
320
+ 'tradition',
321
+ 'personal',
322
+ 'snow',
323
+ 'nobody',
324
+ 'distance',
325
+ 'sort',
326
+ 'sensitive',
327
+ 'animal',
328
+ 'major',
329
+ 'negotiation',
330
+ 'click',
331
+ 'mood',
332
+ 'period',
333
+ 'arrival',
334
+ 'expression',
335
+ 'holiday',
336
+ 'repeat',
337
+ 'dust',
338
+ 'closet',
339
+ 'gold',
340
+ 'bad',
341
+ 'sail',
342
+ 'combination',
343
+ 'clothes',
344
+ 'emphasis',
345
+ 'duty',
346
+ 'black',
347
+ 'step',
348
+ 'school',
349
+ 'jump',
350
+ 'document',
351
+ 'professional',
352
+ 'lip',
353
+ 'chemical',
354
+ 'front',
355
+ 'wake',
356
+ 'while',
357
+ 'inside',
358
+ 'watch',
359
+ 'row',
360
+ 'subject',
361
+ 'penalty',
362
+ 'balance',
363
+ 'possible',
364
+ 'adult',
365
+ 'aside',
366
+ 'sample',
367
+ 'appeal',
368
+ 'wedding',
369
+ 'depth',
370
+ 'king',
371
+ 'award',
372
+ 'wife',
373
+ 'blow',
374
+ 'site',
375
+ 'camp',
376
+ 'music',
377
+ 'safe',
378
+ 'gift',
379
+ 'fault',
380
+ 'guess',
381
+ 'act',
382
+ 'shame',
383
+ 'drama',
384
+ 'capital',
385
+ 'exam',
386
+ 'stupid',
387
+ 'record',
388
+ 'sound',
389
+ 'swing',
390
+ 'novel',
391
+ 'minimum',
392
+ 'ratio',
393
+ 'machine',
394
+ 'shape',
395
+ 'lead',
396
+ 'operation',
397
+ 'salary',
398
+ 'cloud',
399
+ 'affair',
400
+ 'hit',
401
+ 'chapter',
402
+ 'stage',
403
+ 'quantity',
404
+ 'access',
405
+ 'army',
406
+ 'chain',
407
+ 'traffic',
408
+ 'kick',
409
+ 'analysis',
410
+ 'airport',
411
+ 'time',
412
+ 'vacation',
413
+ 'philosophy',
414
+ 'ball',
415
+ 'chest',
416
+ 'thanks',
417
+ 'place',
418
+ 'mountain',
419
+ 'advertising',
420
+ 'red',
421
+ 'past',
422
+ 'rent',
423
+ 'return',
424
+ 'tour',
425
+ 'house',
426
+ 'construction',
427
+ 'net',
428
+ 'native',
429
+ 'war',
430
+ 'figure',
431
+ 'fee',
432
+ 'spray',
433
+ 'user',
434
+ 'dirt',
435
+ 'shot',
436
+ 'task',
437
+ 'stick',
438
+ 'friend',
439
+ 'software',
440
+ 'promotion',
441
+ 'interaction',
442
+ 'surround',
443
+ 'block',
444
+ 'purpose',
445
+ 'practice',
446
+ 'conflict',
447
+ 'routine',
448
+ 'requirement',
449
+ 'bonus',
450
+ 'hole',
451
+ 'state',
452
+ 'junior',
453
+ 'sweet',
454
+ 'catch',
455
+ 'tear',
456
+ 'fold',
457
+ 'wall',
458
+ 'editor',
459
+ 'life',
460
+ 'position',
461
+ 'pound',
462
+ 'respect',
463
+ 'bathroom',
464
+ 'coat',
465
+ 'script',
466
+ 'job',
467
+ 'teach',
468
+ 'birth',
469
+ 'view',
470
+ 'resolve',
471
+ 'theme',
472
+ 'employee',
473
+ 'doubt',
474
+ 'market',
475
+ 'education',
476
+ 'serve',
477
+ 'recover',
478
+ 'tone',
479
+ 'harm',
480
+ 'miss',
481
+ 'union',
482
+ 'understanding',
483
+ 'cow',
484
+ 'river',
485
+ 'association',
486
+ 'concept',
487
+ 'training',
488
+ 'recipe',
489
+ 'relationship',
490
+ 'reserve',
491
+ 'depression',
492
+ 'proof',
493
+ 'hair',
494
+ 'revenue',
495
+ 'independent',
496
+ 'lift',
497
+ 'assignment',
498
+ 'temporary',
499
+ 'amount',
500
+ 'loss',
501
+ 'edge',
502
+ 'track',
503
+ 'check',
504
+ 'rope',
505
+ 'estimate',
506
+ 'pollution',
507
+ 'stable',
508
+ 'message',
509
+ 'delivery',
510
+ 'perspective',
511
+ 'mirror',
512
+ 'assistant',
513
+ 'representative',
514
+ 'witness',
515
+ 'nature',
516
+ 'judge',
517
+ 'fruit',
518
+ 'tip',
519
+ 'devil',
520
+ 'town',
521
+ 'emergency',
522
+ 'upper',
523
+ 'drop',
524
+ 'stay',
525
+ 'human',
526
+ 'neck',
527
+ 'speaker',
528
+ 'network',
529
+ 'sing',
530
+ 'resist',
531
+ 'league',
532
+ 'trip',
533
+ 'signature',
534
+ 'lawyer',
535
+ 'importance',
536
+ 'gas',
537
+ 'choice',
538
+ 'engineer',
539
+ 'success',
540
+ 'part',
541
+ 'external',
542
+ 'worker',
543
+ 'simple',
544
+ 'quarter',
545
+ 'student',
546
+ 'heart',
547
+ 'pass',
548
+ 'spite',
549
+ 'shift',
550
+ 'rough',
551
+ 'lady',
552
+ 'grass',
553
+ 'community',
554
+ 'garage',
555
+ 'youth',
556
+ 'standard',
557
+ 'skirt',
558
+ 'promise',
559
+ 'blind',
560
+ 'television',
561
+ 'disease',
562
+ 'commission',
563
+ 'positive',
564
+ 'energy',
565
+ 'calm',
566
+ 'presence',
567
+ 'tune',
568
+ 'basis',
569
+ 'preference',
570
+ 'head',
571
+ 'common',
572
+ 'cut',
573
+ 'somewhere',
574
+ 'presentation',
575
+ 'current',
576
+ 'thought',
577
+ 'revolution',
578
+ 'effort',
579
+ 'master',
580
+ 'implement',
581
+ 'republic',
582
+ 'floor',
583
+ 'principle',
584
+ 'stranger',
585
+ 'shoulder',
586
+ 'grade',
587
+ 'button',
588
+ 'tennis',
589
+ 'police',
590
+ 'collection',
591
+ 'account',
592
+ 'register',
593
+ 'glove',
594
+ 'divide',
595
+ 'professor',
596
+ 'chair',
597
+ 'priority',
598
+ 'combine',
599
+ 'peace',
600
+ 'extension',
601
+ 'maybe',
602
+ 'evening',
603
+ 'frame',
604
+ 'sister',
605
+ 'wave',
606
+ 'code',
607
+ 'application',
608
+ 'mouse',
609
+ 'match',
610
+ 'counter',
611
+ 'bottle',
612
+ 'half',
613
+ 'cheek',
614
+ 'resolution',
615
+ 'back',
616
+ 'knowledge',
617
+ 'make',
618
+ 'discussion',
619
+ 'screw',
620
+ 'length',
621
+ 'accident',
622
+ 'battle',
623
+ 'dress',
624
+ 'knee',
625
+ 'log',
626
+ 'package',
627
+ 'it',
628
+ 'turn',
629
+ 'hearing',
630
+ 'newspaper',
631
+ 'layer',
632
+ 'wealth',
633
+ 'profile',
634
+ 'imagination',
635
+ 'answer',
636
+ 'weekend',
637
+ 'teacher',
638
+ 'appearance',
639
+ 'meet',
640
+ 'bike',
641
+ 'rise',
642
+ 'belt',
643
+ 'crash',
644
+ 'bowl',
645
+ 'equivalent',
646
+ 'support',
647
+ 'image',
648
+ 'poem',
649
+ 'risk',
650
+ 'excitement',
651
+ 'remote',
652
+ 'secretary',
653
+ 'public',
654
+ 'produce',
655
+ 'plane',
656
+ 'display',
657
+ 'money',
658
+ 'sand',
659
+ 'situation',
660
+ 'punch',
661
+ 'customer',
662
+ 'title',
663
+ 'shake',
664
+ 'mortgage',
665
+ 'option',
666
+ 'number',
667
+ 'pop',
668
+ 'window',
669
+ 'extent',
670
+ 'nothing',
671
+ 'experience',
672
+ 'opinion',
673
+ 'departure',
674
+ 'dance',
675
+ 'indication',
676
+ 'boy',
677
+ 'material',
678
+ 'band',
679
+ 'leader',
680
+ 'sun',
681
+ 'beautiful',
682
+ 'muscle',
683
+ 'farmer',
684
+ 'variety',
685
+ 'fat',
686
+ 'handle',
687
+ 'director',
688
+ 'opportunity',
689
+ 'calendar',
690
+ 'outside',
691
+ 'pace',
692
+ 'bath',
693
+ 'fish',
694
+ 'consequence',
695
+ 'put',
696
+ 'owner',
697
+ 'go',
698
+ 'doctor',
699
+ 'information',
700
+ 'share',
701
+ 'hurt',
702
+ 'protection',
703
+ 'career',
704
+ 'finance',
705
+ 'force',
706
+ 'golf',
707
+ 'garbage',
708
+ 'aspect',
709
+ 'kid',
710
+ 'food',
711
+ 'boot',
712
+ 'milk',
713
+ 'respond',
714
+ 'objective',
715
+ 'reality',
716
+ 'raw',
717
+ 'ring',
718
+ 'mall',
719
+ 'one',
720
+ 'impact',
721
+ 'area',
722
+ 'news',
723
+ 'international',
724
+ 'series',
725
+ 'impress',
726
+ 'mother',
727
+ 'shelter',
728
+ 'strike',
729
+ 'loan',
730
+ 'month',
731
+ 'seat',
732
+ 'anything',
733
+ 'entertainment',
734
+ 'familiar',
735
+ 'clue',
736
+ 'year',
737
+ 'glad',
738
+ 'supermarket',
739
+ 'natural',
740
+ 'god',
741
+ 'cost',
742
+ 'conversation',
743
+ 'tie',
744
+ 'ruin',
745
+ 'comfort',
746
+ 'earth',
747
+ 'storm',
748
+ 'percentage',
749
+ 'assistance',
750
+ 'budget',
751
+ 'strength',
752
+ 'beginning',
753
+ 'sleep',
754
+ 'other',
755
+ 'young',
756
+ 'unit',
757
+ 'fill',
758
+ 'store',
759
+ 'desire',
760
+ 'hide',
761
+ 'value',
762
+ 'cup',
763
+ 'maintenance',
764
+ 'nurse',
765
+ 'function',
766
+ 'tower',
767
+ 'role',
768
+ 'class',
769
+ 'camera',
770
+ 'database',
771
+ 'panic',
772
+ 'nation',
773
+ 'basket',
774
+ 'ice',
775
+ 'art',
776
+ 'spirit',
777
+ 'chart',
778
+ 'exchange',
779
+ 'feedback',
780
+ 'statement',
781
+ 'reputation',
782
+ 'search',
783
+ 'hunt',
784
+ 'exercise',
785
+ 'nasty',
786
+ 'notice',
787
+ 'male',
788
+ 'yard',
789
+ 'annual',
790
+ 'collar',
791
+ 'date',
792
+ 'platform',
793
+ 'plant',
794
+ 'fortune',
795
+ 'passion',
796
+ 'friendship',
797
+ 'spread',
798
+ 'cancer',
799
+ 'ticket',
800
+ 'attitude',
801
+ 'island',
802
+ 'active',
803
+ 'object',
804
+ 'service',
805
+ 'buyer',
806
+ 'bite',
807
+ 'card',
808
+ 'face',
809
+ 'steak',
810
+ 'proposal',
811
+ 'patient',
812
+ 'heat',
813
+ 'rule',
814
+ 'resident',
815
+ 'broad',
816
+ 'politics',
817
+ 'west',
818
+ 'knife',
819
+ 'expert',
820
+ 'girl',
821
+ 'design',
822
+ 'salt',
823
+ 'baseball',
824
+ 'grab',
825
+ 'inspection',
826
+ 'cousin',
827
+ 'couple',
828
+ 'magazine',
829
+ 'cook',
830
+ 'dependent',
831
+ 'security',
832
+ 'chicken',
833
+ 'version',
834
+ 'currency',
835
+ 'ladder',
836
+ 'scheme',
837
+ 'kitchen',
838
+ 'employment',
839
+ 'local',
840
+ 'attention',
841
+ 'manager',
842
+ 'fact',
843
+ 'cover',
844
+ 'sad',
845
+ 'guard',
846
+ 'relative',
847
+ 'county',
848
+ 'rate',
849
+ 'lunch',
850
+ 'program',
851
+ 'initiative',
852
+ 'gear',
853
+ 'bridge',
854
+ 'breast',
855
+ 'talk',
856
+ 'dish',
857
+ 'guarantee',
858
+ 'beer',
859
+ 'vehicle',
860
+ 'reception',
861
+ 'woman',
862
+ 'substance',
863
+ 'copy',
864
+ 'lecture',
865
+ 'advantage',
866
+ 'park',
867
+ 'cold',
868
+ 'death',
869
+ 'mix',
870
+ 'hold',
871
+ 'scale',
872
+ 'tomorrow',
873
+ 'blood',
874
+ 'request',
875
+ 'green',
876
+ 'cookie',
877
+ 'church',
878
+ 'strip',
879
+ 'forever',
880
+ 'beyond',
881
+ 'debt',
882
+ 'tackle',
883
+ 'wash',
884
+ 'following',
885
+ 'feel',
886
+ 'maximum',
887
+ 'sector',
888
+ 'sea',
889
+ 'property',
890
+ 'economics',
891
+ 'menu',
892
+ 'bench',
893
+ 'try',
894
+ 'language',
895
+ 'start',
896
+ 'call',
897
+ 'solid',
898
+ 'address',
899
+ 'income',
900
+ 'foot',
901
+ 'senior',
902
+ 'honey',
903
+ 'few',
904
+ 'mixture',
905
+ 'cash',
906
+ 'grocery',
907
+ 'link',
908
+ 'map',
909
+ 'form',
910
+ 'factor',
911
+ 'pot',
912
+ 'model',
913
+ 'writer',
914
+ 'farm',
915
+ 'winter',
916
+ 'skill',
917
+ 'anywhere',
918
+ 'birthday',
919
+ 'policy',
920
+ 'release',
921
+ 'husband',
922
+ 'lab',
923
+ 'hurry',
924
+ 'mail',
925
+ 'equipment',
926
+ 'sink',
927
+ 'pair',
928
+ 'driver',
929
+ 'consideration',
930
+ 'leather',
931
+ 'skin',
932
+ 'blue',
933
+ 'boat',
934
+ 'sale',
935
+ 'brick',
936
+ 'two',
937
+ 'feed',
938
+ 'square',
939
+ 'dot',
940
+ 'rush',
941
+ 'dream',
942
+ 'location',
943
+ 'afternoon',
944
+ 'manufacturer',
945
+ 'control',
946
+ 'occasion',
947
+ 'trouble',
948
+ 'introduction',
949
+ 'advice',
950
+ 'bet',
951
+ 'eat',
952
+ 'kill',
953
+ 'category',
954
+ 'manner',
955
+ 'office',
956
+ 'estate',
957
+ 'pride',
958
+ 'awareness',
959
+ 'slip',
960
+ 'crack',
961
+ 'client',
962
+ 'nail',
963
+ 'shoot',
964
+ 'membership',
965
+ 'soft',
966
+ 'anybody',
967
+ 'web',
968
+ 'official',
969
+ 'individual',
970
+ 'pizza',
971
+ 'interest',
972
+ 'bag',
973
+ 'spell',
974
+ 'profession',
975
+ 'queen',
976
+ 'deal',
977
+ 'resource',
978
+ 'ship',
979
+ 'guy',
980
+ 'chocolate',
981
+ 'joint',
982
+ 'formal',
983
+ 'upstairs',
984
+ 'car',
985
+ 'resort',
986
+ 'abroad',
987
+ 'dealer',
988
+ 'associate',
989
+ 'finger',
990
+ 'surgery',
991
+ 'comment',
992
+ 'team',
993
+ 'detail',
994
+ 'crazy',
995
+ 'path',
996
+ 'tale',
997
+ 'initial',
998
+ 'arm',
999
+ 'radio',
1000
+ 'demand',
1001
+ 'single',
1002
+ 'draw',
1003
+ 'yellow',
1004
+ 'contest',
1005
+ 'piece',
1006
+ 'quote',
1007
+ 'pull',
1008
+ 'commercial',
1009
+ 'shirt',
1010
+ 'contribution',
1011
+ 'cream',
1012
+ 'channel',
1013
+ 'suit',
1014
+ 'discipline',
1015
+ 'instruction',
1016
+ 'concert',
1017
+ 'speech',
1018
+ 'low',
1019
+ 'effective',
1020
+ 'hang',
1021
+ 'scratch',
1022
+ 'industry',
1023
+ 'breakfast',
1024
+ 'lay',
1025
+ 'join',
1026
+ 'metal',
1027
+ 'bedroom',
1028
+ 'minute',
1029
+ 'product',
1030
+ 'rest',
1031
+ 'temperature',
1032
+ 'many',
1033
+ 'give',
1034
+ 'argument',
1035
+ 'print',
1036
+ 'purple',
1037
+ 'laugh',
1038
+ 'health',
1039
+ 'credit',
1040
+ 'investment',
1041
+ 'sell',
1042
+ 'setting',
1043
+ 'lesson',
1044
+ 'egg',
1045
+ 'middle',
1046
+ 'marriage',
1047
+ 'level',
1048
+ 'evidence',
1049
+ 'phrase',
1050
+ 'love',
1051
+ 'self',
1052
+ 'benefit',
1053
+ 'guidance',
1054
+ 'affect',
1055
+ 'you',
1056
+ 'dad',
1057
+ 'anxiety',
1058
+ 'special',
1059
+ 'boyfriend',
1060
+ 'test',
1061
+ 'blank',
1062
+ 'payment',
1063
+ 'soup',
1064
+ 'obligation',
1065
+ 'reply',
1066
+ 'smile',
1067
+ 'deep',
1068
+ 'complaint',
1069
+ 'addition',
1070
+ 'review',
1071
+ 'box',
1072
+ 'towel',
1073
+ 'minor',
1074
+ 'fun',
1075
+ 'soil',
1076
+ 'issue',
1077
+ 'cigarette',
1078
+ 'internet',
1079
+ 'gain',
1080
+ 'tell',
1081
+ 'entry',
1082
+ 'spare',
1083
+ 'incident',
1084
+ 'family',
1085
+ 'refuse',
1086
+ 'branch',
1087
+ 'can',
1088
+ 'pen',
1089
+ 'grandfather',
1090
+ 'constant',
1091
+ 'tank',
1092
+ 'uncle',
1093
+ 'climate',
1094
+ 'ground',
1095
+ 'volume',
1096
+ 'communication',
1097
+ 'kind',
1098
+ 'poet',
1099
+ 'child',
1100
+ 'screen',
1101
+ 'mine',
1102
+ 'quit',
1103
+ 'gene',
1104
+ 'lack',
1105
+ 'charity',
1106
+ 'memory',
1107
+ 'tooth',
1108
+ 'fear',
1109
+ 'mention',
1110
+ 'marketing',
1111
+ 'reveal',
1112
+ 'reason',
1113
+ 'court',
1114
+ 'season',
1115
+ 'freedom',
1116
+ 'land',
1117
+ 'sport',
1118
+ 'audience',
1119
+ 'classroom',
1120
+ 'law',
1121
+ 'hook',
1122
+ 'win',
1123
+ 'carry',
1124
+ 'eye',
1125
+ 'smell',
1126
+ 'distribution',
1127
+ 'research',
1128
+ 'country',
1129
+ 'dare',
1130
+ 'hope',
1131
+ 'whereas',
1132
+ 'stretch',
1133
+ 'library',
1134
+ 'if',
1135
+ 'delay',
1136
+ 'college',
1137
+ 'plastic',
1138
+ 'book',
1139
+ 'present',
1140
+ 'use',
1141
+ 'worry',
1142
+ 'champion',
1143
+ 'goal',
1144
+ 'economy',
1145
+ 'march',
1146
+ 'election',
1147
+ 'reflection',
1148
+ 'midnight',
1149
+ 'slide',
1150
+ 'inflation',
1151
+ 'action',
1152
+ 'challenge',
1153
+ 'guitar',
1154
+ 'coast',
1155
+ 'apple',
1156
+ 'campaign',
1157
+ 'field',
1158
+ 'jacket',
1159
+ 'sense',
1160
+ 'way',
1161
+ 'visual',
1162
+ 'remove',
1163
+ 'weather',
1164
+ 'trash',
1165
+ 'cable',
1166
+ 'regret',
1167
+ 'buddy',
1168
+ 'beach',
1169
+ 'historian',
1170
+ 'courage',
1171
+ 'sympathy',
1172
+ 'truck',
1173
+ 'tension',
1174
+ 'permit',
1175
+ 'nose',
1176
+ 'bed',
1177
+ 'son',
1178
+ 'person',
1179
+ 'base',
1180
+ 'meat',
1181
+ 'usual',
1182
+ 'air',
1183
+ 'meeting',
1184
+ 'worth',
1185
+ 'game',
1186
+ 'independence',
1187
+ 'physical',
1188
+ 'brief',
1189
+ 'play',
1190
+ 'raise',
1191
+ 'board',
1192
+ 'she',
1193
+ 'key',
1194
+ 'writing',
1195
+ 'pick',
1196
+ 'command',
1197
+ 'party',
1198
+ 'yesterday',
1199
+ 'spring',
1200
+ 'candidate',
1201
+ 'physics',
1202
+ 'university',
1203
+ 'concern',
1204
+ 'development',
1205
+ 'change',
1206
+ 'string',
1207
+ 'target',
1208
+ 'instance',
1209
+ 'room',
1210
+ 'bitter',
1211
+ 'bird',
1212
+ 'football',
1213
+ 'normal',
1214
+ 'split',
1215
+ 'impression',
1216
+ 'wood',
1217
+ 'long',
1218
+ 'meaning',
1219
+ 'stock',
1220
+ 'cap',
1221
+ 'leadership',
1222
+ 'media',
1223
+ 'ambition',
1224
+ 'fishing',
1225
+ 'essay',
1226
+ 'salad',
1227
+ 'repair',
1228
+ 'today',
1229
+ 'designer',
1230
+ 'night',
1231
+ 'bank',
1232
+ 'drawing',
1233
+ 'inevitable',
1234
+ 'phase',
1235
+ 'vast',
1236
+ 'chip',
1237
+ 'anger',
1238
+ 'switch',
1239
+ 'cry',
1240
+ 'twist',
1241
+ 'personality',
1242
+ 'attempt',
1243
+ 'storage',
1244
+ 'being',
1245
+ 'preparation',
1246
+ 'bat',
1247
+ 'selection',
1248
+ 'white',
1249
+ 'technology',
1250
+ 'contract',
1251
+ 'side',
1252
+ 'section',
1253
+ 'station',
1254
+ 'till',
1255
+ 'structure',
1256
+ 'tongue',
1257
+ 'taste',
1258
+ 'truth',
1259
+ 'difficulty',
1260
+ 'group',
1261
+ 'limit',
1262
+ 'main',
1263
+ 'move',
1264
+ 'feeling',
1265
+ 'light',
1266
+ 'example',
1267
+ 'mission',
1268
+ 'might',
1269
+ 'wait',
1270
+ 'wheel',
1271
+ 'shop',
1272
+ 'host',
1273
+ 'classic',
1274
+ 'alternative',
1275
+ 'cause',
1276
+ 'agent',
1277
+ 'consist',
1278
+ 'table',
1279
+ 'airline',
1280
+ 'text',
1281
+ 'pool',
1282
+ 'craft',
1283
+ 'range',
1284
+ 'fuel',
1285
+ 'tool',
1286
+ 'partner',
1287
+ 'load',
1288
+ 'entrance',
1289
+ 'deposit',
1290
+ 'hate',
1291
+ 'article',
1292
+ 'video',
1293
+ 'summer',
1294
+ 'feature',
1295
+ 'extreme',
1296
+ 'mobile',
1297
+ 'hospital',
1298
+ 'flight',
1299
+ 'fall',
1300
+ 'pension',
1301
+ 'piano',
1302
+ 'fail',
1303
+ 'result',
1304
+ 'rub',
1305
+ 'gap',
1306
+ 'system',
1307
+ 'report',
1308
+ 'suck',
1309
+ 'ordinary',
1310
+ 'wind',
1311
+ 'nerve',
1312
+ 'ask',
1313
+ 'shine',
1314
+ 'note',
1315
+ 'line',
1316
+ 'mom',
1317
+ 'perception',
1318
+ 'brother',
1319
+ 'reference',
1320
+ 'bend',
1321
+ 'charge',
1322
+ 'treat',
1323
+ 'trick',
1324
+ 'term',
1325
+ 'homework',
1326
+ 'bake',
1327
+ 'bid',
1328
+ 'status',
1329
+ 'project',
1330
+ 'strategy',
1331
+ 'orange',
1332
+ 'let',
1333
+ 'enthusiasm',
1334
+ 'parent',
1335
+ 'concentrate',
1336
+ 'device',
1337
+ 'travel',
1338
+ 'poetry',
1339
+ 'business',
1340
+ 'society',
1341
+ 'kiss',
1342
+ 'end',
1343
+ 'vegetable',
1344
+ 'employ',
1345
+ 'schedule',
1346
+ 'hour',
1347
+ 'brave',
1348
+ 'focus',
1349
+ 'process',
1350
+ 'movie',
1351
+ 'illegal',
1352
+ 'general',
1353
+ 'coffee',
1354
+ 'ad',
1355
+ 'highway',
1356
+ 'chemistry',
1357
+ 'psychology',
1358
+ 'hire',
1359
+ 'bell',
1360
+ 'conference',
1361
+ 'relief',
1362
+ 'show',
1363
+ 'neat',
1364
+ 'funny',
1365
+ 'weight',
1366
+ 'quality',
1367
+ 'club',
1368
+ 'daughter',
1369
+ 'zone',
1370
+ 'touch',
1371
+ 'tonight',
1372
+ 'shock',
1373
+ 'burn',
1374
+ 'excuse',
1375
+ 'name',
1376
+ 'survey',
1377
+ 'landscape',
1378
+ 'advance',
1379
+ 'satisfaction',
1380
+ 'bread',
1381
+ 'disaster',
1382
+ 'item',
1383
+ 'hat',
1384
+ 'prior',
1385
+ 'shopping',
1386
+ 'visit',
1387
+ 'east',
1388
+ 'photo',
1389
+ 'home',
1390
+ 'idea',
1391
+ 'father',
1392
+ 'comparison',
1393
+ 'cat',
1394
+ 'pipe',
1395
+ 'winner',
1396
+ 'count',
1397
+ 'lake',
1398
+ 'fight',
1399
+ 'prize',
1400
+ 'foundation',
1401
+ 'dog',
1402
+ 'keep',
1403
+ 'ideal',
1404
+ 'fan',
1405
+ 'struggle',
1406
+ 'peak',
1407
+ 'safety',
1408
+ 'solution',
1409
+ 'hell',
1410
+ 'conclusion',
1411
+ 'population',
1412
+ 'strain',
1413
+ 'alarm',
1414
+ 'measurement',
1415
+ 'second',
1416
+ 'train',
1417
+ 'race',
1418
+ 'due',
1419
+ 'insurance',
1420
+ 'boss',
1421
+ 'tree',
1422
+ 'monitor',
1423
+ 'sick',
1424
+ 'course',
1425
+ 'drag',
1426
+ 'appointment',
1427
+ 'slice',
1428
+ 'still',
1429
+ 'care',
1430
+ 'patience',
1431
+ 'rich',
1432
+ 'escape',
1433
+ 'emotion',
1434
+ 'royal',
1435
+ 'female',
1436
+ 'childhood',
1437
+ 'government',
1438
+ 'picture',
1439
+ 'will',
1440
+ 'sock',
1441
+ 'big',
1442
+ 'gate',
1443
+ 'oil',
1444
+ 'cross',
1445
+ 'pin',
1446
+ 'improvement',
1447
+ 'championship',
1448
+ 'silly',
1449
+ 'help',
1450
+ 'sky',
1451
+ 'pitch',
1452
+ 'man',
1453
+ 'diamond',
1454
+ 'most',
1455
+ 'transition',
1456
+ 'work',
1457
+ 'science',
1458
+ 'committee',
1459
+ 'moment',
1460
+ 'fix',
1461
+ 'teaching',
1462
+ 'dig',
1463
+ 'specialist',
1464
+ 'complex',
1465
+ 'guide',
1466
+ 'people',
1467
+ 'dead',
1468
+ 'voice',
1469
+ 'original',
1470
+ 'break',
1471
+ 'topic',
1472
+ 'data',
1473
+ 'degree',
1474
+ 'reading',
1475
+ 'recording',
1476
+ 'bunch',
1477
+ 'reach',
1478
+ 'judgment',
1479
+ 'lie',
1480
+ 'regular',
1481
+ 'set',
1482
+ 'painting',
1483
+ 'mode',
1484
+ 'list',
1485
+ 'player',
1486
+ 'bear',
1487
+ 'north',
1488
+ 'wonder',
1489
+ 'carpet',
1490
+ 'heavy',
1491
+ 'officer',
1492
+ 'negative',
1493
+ 'clock',
1494
+ 'unique',
1495
+ 'baby',
1496
+ 'pain',
1497
+ 'assumption',
1498
+ 'disk',
1499
+ 'iron',
1500
+ 'bill',
1501
+ 'drawer',
1502
+ 'look',
1503
+ 'double',
1504
+ 'mistake',
1505
+ 'finish',
1506
+ 'future',
1507
+ 'brilliant',
1508
+ 'contact',
1509
+ 'math',
1510
+ 'rice',
1511
+ 'leave',
1512
+ 'restaurant',
1513
+ 'discount',
1514
+ 'sex',
1515
+ 'virus',
1516
+ 'bit',
1517
+ 'trust',
1518
+ 'event',
1519
+ 'wear',
1520
+ 'juice',
1521
+ 'failure',
1522
+ 'bug',
1523
+ 'context',
1524
+ 'mud',
1525
+ 'whole',
1526
+ 'wrap',
1527
+ 'intention',
1528
+ 'draft',
1529
+ 'pressure',
1530
+ 'cake',
1531
+ 'dark',
1532
+ 'explanation',
1533
+ 'space',
1534
+ 'angle',
1535
+ 'word',
1536
+ 'efficiency',
1537
+ 'management',
1538
+ 'habit',
1539
+ 'star',
1540
+ 'chance',
1541
+ 'finding',
1542
+ 'transportation',
1543
+ 'stand',
1544
+ 'criticism',
1545
+ 'flow',
1546
+ 'door',
1547
+ 'injury',
1548
+ 'insect',
1549
+ 'surprise',
1550
+ 'apartment',
1551
+ ] # pylint: disable=line-too-long
1552
+
1553
# Immutable lookup table: ISO 639-1 two-letter code -> English language name.
LANGUAGE_CODES = immutabledict.immutabledict({
    'en': 'English', 'es': 'Spanish', 'pt': 'Portuguese',
    'ar': 'Arabic', 'hi': 'Hindi', 'fr': 'French',
    'ru': 'Russian', 'de': 'German', 'ja': 'Japanese',
    'it': 'Italian', 'bn': 'Bengali', 'uk': 'Ukrainian',
    'th': 'Thai', 'ur': 'Urdu', 'ta': 'Tamil',
    'te': 'Telugu', 'bg': 'Bulgarian', 'ko': 'Korean',
    'pl': 'Polish', 'he': 'Hebrew', 'fa': 'Persian',
    'vi': 'Vietnamese', 'ne': 'Nepali', 'sw': 'Swahili',
    'kn': 'Kannada', 'mr': 'Marathi', 'gu': 'Gujarati',
    'pa': 'Punjabi', 'ml': 'Malayalam', 'fi': 'Finnish',
})
1586
+
1587
# Regex fragments consumed by split_into_sentences(). They are combined by
# plain string concatenation, so the capture groups in each one matter.

# A single ASCII letter (captured).
_ALPHABETS = r'([A-Za-z])'
# Courtesy titles / "St" immediately followed by a period.
_PREFIXES = r'(Mr|St|Mrs|Ms|Dr)[.]'
# Company and name suffixes (the period is appended at the call site).
_SUFFIXES = r'(Inc|Ltd|Jr|Sr|Co)'
# Words treated as the likely start of a new sentence.
_STARTERS = r'(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)'  # noqa: E501
# Dotted upper-case acronyms with two or three letters, e.g. "U.S." / "U.S.A.".
_ACRONYMS = r'([A-Z][.][A-Z][.](?:[A-Z][.])?)'
# A dot followed by a common top-level domain.
_WEBSITES = r'[.](com|net|org|io|gov|edu|me)'
# A single decimal digit (captured).
_DIGITS = r'([0-9])'
# A run of two or more consecutive dots (ellipsis-like).
_MULTIPLE_DOTS = r'\.{2,}'
1595
+
1596
+
1597
def split_into_sentences(text):
    """Break a piece of text into its individual sentences.

    The splitter is rule based. Periods that do not end a sentence (titles,
    domain endings, decimal points, initials, acronyms, company suffixes) are
    first swapped for the placeholder ``<prd>``. Sentence boundaries are then
    tagged with ``<stop>``, the placeholders are turned back into periods and
    the text is cut at every ``<stop>``.

    Args:
        text: A string containing one or more sentences.

    Returns:
        A list of whitespace-stripped sentence strings in order of
        appearance. An empty final piece left over by the split is dropped.
    """

    def _mask_dot_run(match):
        # Protect every dot of the run, then mark the end of the sentence.
        return '<prd>' * len(match.group(0)) + '<stop>'

    letter_dot = _ALPHABETS + '[.]'

    # Pad with spaces so the space-anchored patterns below can also match at
    # the very start and end of the input; newlines are treated as spaces.
    buf = (' ' + text + ' ').replace('\n', ' ')

    # Titles, domain endings and decimal points.
    early_rules = (
        (_PREFIXES, r'\1<prd>'),
        (_WEBSITES, r'<prd>\1'),
        (_DIGITS + '[.]' + _DIGITS, r'\1<prd>\2'),
    )
    for pattern, replacement in early_rules:
        buf = re.sub(pattern, replacement, buf)

    buf = re.sub(_MULTIPLE_DOTS, _mask_dot_run, buf)
    buf = buf.replace('Ph.D.', 'Ph<prd>D<prd>')

    # Initials, acronyms and suffixes. The order is significant: the rules
    # that emit <stop> must run before the broader rules that only protect
    # the same periods with <prd>.
    late_rules = (
        (r'\s' + letter_dot + ' ', r' \1<prd> '),
        (_ACRONYMS + ' ' + _STARTERS, r'\1<stop> \2'),
        (letter_dot * 3, r'\1<prd>\2<prd>\3<prd>'),
        (letter_dot * 2, r'\1<prd>\2<prd>'),
        (' ' + _SUFFIXES + '[.] ' + _STARTERS, r' \1<stop> \2'),
        (' ' + _SUFFIXES + '[.]', r' \1<prd>'),
        (' ' + letter_dot, r' \1<prd>'),
    )
    for pattern, replacement in late_rules:
        buf = re.sub(pattern, replacement, buf)

    # Move terminal punctuation outside a closing quote so that the quote
    # stays attached to the sentence it closes.
    for quoted, swapped in (('.”', '”.'), ('."', '".'), ('!"', '"!'), ('?"', '"?')):
        buf = buf.replace(quoted, swapped)

    # Every remaining terminator ends a sentence.
    for terminator in ('.', '?', '!'):
        buf = buf.replace(terminator, terminator + '<stop>')

    # Restore the protected periods, then cut at the boundary markers.
    buf = buf.replace('<prd>', '.')
    pieces = [piece.strip() for piece in buf.split('<stop>')]
    if pieces and pieces[-1] == '':
        pieces.pop()
    return pieces
1646
+
1647
+
1648
def count_words(text):
    """Return the number of word tokens in ``text``.

    Tokens are produced by an NLTK ``RegexpTokenizer`` built with a
    word-character pattern, so punctuation and whitespace are not counted.
    """
    return len(nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(text))
1654
+
1655
+
1656
@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
    """Load the English Punkt sentence tokenizer from the NLTK data store.

    The function takes no arguments and is wrapped in an unbounded
    ``lru_cache``, so the resource is loaded on the first call only and the
    same object is handed back on every later call.
    """
    # NOTE(review): the 'punkt' NLTK resource presumably has to be installed
    # before this is called - confirm where the download happens.
    punkt_resource = 'nltk:tokenizers/punkt/english.pickle'
    return nltk.data.load(punkt_resource)
1659
+
1660
+
1661
def count_sentences(text):
    """Return how many sentences the cached sentence tokenizer finds in ``text``."""
    return len(_get_sentence_tokenizer().tokenize(text))
1666
+
1667
+
1668
def generate_keywords(num_keywords):
    """Draw ``num_keywords`` random entries from ``WORD_LIST``.

    Entries are sampled without replacement via ``random.sample``, which
    raises ``ValueError`` when more items are requested than the list holds.

    Args:
        num_keywords: How many keywords to return.

    Returns:
        A new list holding the sampled words.
    """
    keywords = random.sample(WORD_LIST, num_keywords)
    return keywords