bot_twitter_ebooks 3.3.0 → 3.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/data/nouns.txt CHANGED
@@ -281,6 +281,7 @@ butcher
281
281
  butter
282
282
  button
283
283
  buzzard
284
+ c-clamp
284
285
  cabbage
285
286
  cabinet
286
287
  cable
@@ -334,7 +335,6 @@ cauliflower
334
335
  cause
335
336
  caution
336
337
  cave
337
- c-clamp
338
338
  cd
339
339
  ceiling
340
340
  celery
@@ -705,18 +705,18 @@ ethernet
705
705
  euphonium
706
706
  evening
707
707
  event
708
+ ex-husband
709
+ ex-wife
708
710
  examination
709
711
  example
710
712
  exchange
711
713
  exclamation
712
714
  exhaust
713
- ex-husband
714
715
  existence
715
716
  expansion
716
717
  experience
717
718
  expert
718
719
  explanation
719
- ex-wife
720
720
  eye
721
721
  eyebrow
722
722
  eyelash
@@ -1227,8 +1227,8 @@ millimeter
1227
1227
  millisecond
1228
1228
  mind
1229
1229
  mine
1230
- minibus
1231
1230
  mini-skirt
1231
+ minibus
1232
1232
  minister
1233
1233
  mint
1234
1234
  minute
@@ -1852,17 +1852,17 @@ stem
1852
1852
  step
1853
1853
  step-aunt
1854
1854
  step-brother
1855
- stepdaughter
1856
1855
  step-daughter
1857
1856
  step-father
1858
1857
  step-grandfather
1859
1858
  step-grandmother
1860
- stepmother
1861
1859
  step-mother
1862
1860
  step-sister
1863
- stepson
1864
1861
  step-son
1865
1862
  step-uncle
1863
+ stepdaughter
1864
+ stepmother
1865
+ stepson
1866
1866
  steven
1867
1867
  stew
1868
1868
  stick
@@ -1928,6 +1928,7 @@ swordfish
1928
1928
  sycamore
1929
1929
  syrup
1930
1930
  system
1931
+ t-shirt
1931
1932
  table
1932
1933
  tablecloth
1933
1934
  tabletop
@@ -1998,8 +1999,8 @@ toast
1998
1999
  toe
1999
2000
  toenail
2000
2001
  toilet
2001
- tomato
2002
2002
  tom-tom
2003
+ tomato
2003
2004
  ton
2004
2005
  tongue
2005
2006
  tooth
@@ -2039,7 +2040,6 @@ trowel
2039
2040
  truck
2040
2041
  trumpet
2041
2042
  trunk
2042
- t-shirt
2043
2043
  tsunami
2044
2044
  tub
2045
2045
  tuba
@@ -154,7 +154,7 @@ module Ebooks
154
154
  end
155
155
  else
156
156
  log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
157
- lines = content.split("\n")
157
+ lines = content.split("\n").reject { |l| l.start_with?('#') } # Remove commented lines
158
158
  end
159
159
 
160
160
  consume_lines(lines)
@@ -163,13 +163,12 @@ module Ebooks
163
163
  # Consume a sequence of lines
164
164
  # @param lines [Array<String>]
165
165
  def consume_lines(lines)
166
- log "Removing commented lines and sorting mentions"
166
+ log "Removing rts and sorting mentions"
167
167
 
168
168
  statements = []
169
169
  mentions = []
170
170
  lines.each do |l|
171
- next if l.start_with?('#') # Remove commented lines
172
- next if l.include?(': "RT @') || l.include?(': "MT @') # Remove soft retweets
171
+ next if l.start_with?('RT @') || l.start_with?('MT @') # Remove soft retweets
173
172
 
174
173
  if l.include?('@')
175
174
  mentions << NLP.normalize(l)
@@ -1,3 +1,3 @@
1
1
  module Ebooks
2
- VERSION = "3.3.0"
2
+ VERSION = "3.3.1"
3
3
  end
@@ -1,3 +1,8 @@
1
+ !
2
+ 'll
3
+ 've
4
+ .
5
+ ?
1
6
  a
2
7
  able
3
8
  about
@@ -8,6 +13,7 @@ according
8
13
  accordingly
9
14
  across
10
15
  act
16
+ actual
11
17
  actually
12
18
  added
13
19
  adj
@@ -18,6 +24,7 @@ after
18
24
  afterwards
19
25
  again
20
26
  against
27
+ ago
21
28
  ah
22
29
  all
23
30
  almost
@@ -47,6 +54,7 @@ apparently
47
54
  approximately
48
55
  are
49
56
  aren
57
+ aren't
50
58
  arent
51
59
  arise
52
60
  around
@@ -54,13 +62,17 @@ as
54
62
  aside
55
63
  ask
56
64
  asking
65
+ assume
57
66
  at
58
67
  auth
59
68
  available
60
69
  away
61
70
  awfully
71
+ aww
62
72
  b
63
73
  back
74
+ bad
75
+ basically
64
76
  be
65
77
  became
66
78
  because
@@ -80,35 +92,54 @@ believe
80
92
  below
81
93
  beside
82
94
  besides
95
+ best
96
+ better
83
97
  between
84
98
  beyond
99
+ big
85
100
  biol
101
+ bit
102
+ book
86
103
  both
87
104
  brief
88
105
  briefly
106
+ btw
89
107
  but
90
108
  by
91
109
  c
92
110
  ca
111
+ call
112
+ called
93
113
  came
94
114
  can
95
- cannot
96
115
  can't
116
+ cannot
117
+ case
97
118
  cause
98
119
  causes
99
120
  certain
100
121
  certainly
122
+ clearly
101
123
  co
102
124
  com
103
125
  come
104
126
  comes
127
+ completely
105
128
  contain
106
129
  containing
107
130
  contains
131
+ context
132
+ cool
108
133
  could
109
134
  couldnt
135
+ course
136
+ current
110
137
  d
138
+ damn
111
139
  date
140
+ day
141
+ dear
142
+ definitely
112
143
  did
113
144
  didn't
114
145
  different
@@ -116,8 +147,9 @@ do
116
147
  does
117
148
  doesn't
118
149
  doing
119
- done
150
+ don
120
151
  don't
152
+ done
121
153
  down
122
154
  downwards
123
155
  due
@@ -136,6 +168,8 @@ elsewhere
136
168
  end
137
169
  ending
138
170
  enough
171
+ entire
172
+ entirely
139
173
  especially
140
174
  et
141
175
  et-al
@@ -148,12 +182,18 @@ everyone
148
182
  everything
149
183
  everywhere
150
184
  ex
185
+ exactly
151
186
  except
152
187
  f
188
+ fact
153
189
  far
190
+ feel
154
191
  few
155
192
  ff
193
+ ffs
156
194
  fifth
195
+ finally
196
+ find
157
197
  first
158
198
  five
159
199
  fix
@@ -164,9 +204,13 @@ for
164
204
  former
165
205
  formerly
166
206
  forth
207
+ fortunately
167
208
  found
168
209
  four
210
+ free
169
211
  from
212
+ fuck
213
+ fun
170
214
  further
171
215
  furthermore
172
216
  g
@@ -179,21 +223,38 @@ given
179
223
  gives
180
224
  giving
181
225
  go
226
+ god
182
227
  goes
228
+ going
183
229
  gone
230
+ gonna
231
+ good
184
232
  got
185
233
  gotten
234
+ great
235
+ guess
236
+ guy
237
+ guys
186
238
  h
187
239
  had
240
+ hah
241
+ haha
242
+ hahaha
243
+ half
188
244
  happens
245
+ hard
189
246
  hardly
190
247
  has
191
248
  hasn't
249
+ hate
192
250
  have
193
251
  haven't
194
252
  having
195
253
  he
254
+ he's
255
+ heard
196
256
  hed
257
+ help
197
258
  hence
198
259
  her
199
260
  here
@@ -205,25 +266,42 @@ hereupon
205
266
  hers
206
267
  herself
207
268
  hes
269
+ hey
208
270
  hi
209
271
  hid
272
+ high
210
273
  him
211
274
  himself
212
275
  his
213
276
  hither
277
+ hmm
278
+ hmmm
279
+ holy
214
280
  home
281
+ hope
282
+ hopefully
283
+ house
215
284
  how
216
285
  howbeit
217
286
  however
287
+ http
288
+ https
289
+ hullo
218
290
  hundred
219
291
  i
292
+ i'd
293
+ i'll
294
+ i'm
295
+ i've
220
296
  id
297
+ idea
298
+ idk
221
299
  ie
222
300
  if
223
- i'll
224
301
  im
225
302
  immediate
226
303
  immediately
304
+ imo
227
305
  importance
228
306
  important
229
307
  in
@@ -232,17 +310,20 @@ indeed
232
310
  index
233
311
  information
234
312
  instead
313
+ interesting
314
+ internet
235
315
  into
236
316
  invention
237
317
  inward
238
318
  is
239
319
  isn't
240
320
  it
241
- itd
321
+ it'd
242
322
  it'll
323
+ it's
324
+ itd
243
325
  its
244
326
  itself
245
- i've
246
327
  j
247
328
  just
248
329
  k
@@ -250,6 +331,8 @@ keep
250
331
  keeps
251
332
  kept
252
333
  kg
334
+ kind
335
+ kinda
253
336
  km
254
337
  know
255
338
  known
@@ -265,22 +348,29 @@ least
265
348
  less
266
349
  lest
267
350
  let
351
+ let's
268
352
  lets
353
+ life
269
354
  like
270
355
  liked
271
356
  likely
272
357
  line
358
+ literally
273
359
  little
274
- 'll
360
+ lol
361
+ long
275
362
  look
276
363
  looking
277
364
  looks
365
+ lot
366
+ lots
278
367
  ltd
279
368
  m
280
369
  made
281
370
  mainly
282
371
  make
283
372
  makes
373
+ man
284
374
  many
285
375
  may
286
376
  maybe
@@ -320,9 +410,12 @@ need
320
410
  needs
321
411
  neither
322
412
  never
413
+ nevermind
323
414
  nevertheless
324
415
  new
416
+ news
325
417
  next
418
+ night
326
419
  nine
327
420
  ninety
328
421
  no
@@ -331,6 +424,7 @@ non
331
424
  none
332
425
  nonetheless
333
426
  noone
427
+ nope
334
428
  nor
335
429
  normally
336
430
  nos
@@ -350,6 +444,7 @@ oh
350
444
  ok
351
445
  okay
352
446
  old
447
+ omg
353
448
  omitted
354
449
  on
355
450
  once
@@ -357,6 +452,10 @@ one
357
452
  ones
358
453
  only
359
454
  onto
455
+ ooh
456
+ oooh
457
+ oops
458
+ open
360
459
  or
361
460
  ord
362
461
  other
@@ -379,11 +478,17 @@ part
379
478
  particular
380
479
  particularly
381
480
  past
481
+ people
382
482
  per
383
483
  perhaps
484
+ person
485
+ phone
486
+ place
384
487
  placed
488
+ play
385
489
  please
386
490
  plus
491
+ point
387
492
  poorly
388
493
  possible
389
494
  possibly
@@ -391,15 +496,18 @@ potentially
391
496
  pp
392
497
  predominantly
393
498
  present
499
+ pretty
394
500
  previously
395
501
  primarily
396
502
  probably
503
+ problem
397
504
  promptly
398
505
  proud
399
506
  provides
400
507
  put
401
508
  q
402
509
  que
510
+ question
403
511
  quickly
404
512
  quite
405
513
  qv
@@ -408,8 +516,11 @@ ran
408
516
  rather
409
517
  rd
410
518
  re
519
+ read
411
520
  readily
521
+ real
412
522
  really
523
+ reason
413
524
  recent
414
525
  recently
415
526
  ref
@@ -419,6 +530,7 @@ regardless
419
530
  regards
420
531
  related
421
532
  relatively
533
+ remember
422
534
  research
423
535
  respectively
424
536
  resulted
@@ -427,12 +539,14 @@ results
427
539
  right
428
540
  run
429
541
  s
542
+ sadly
430
543
  said
431
544
  same
432
545
  saw
433
546
  say
434
547
  saying
435
548
  says
549
+ screen
436
550
  sec
437
551
  section
438
552
  see
@@ -445,13 +559,17 @@ seen
445
559
  self
446
560
  selves
447
561
  sent
562
+ seriously
563
+ set
448
564
  seven
449
565
  several
450
566
  shall
451
567
  she
452
- shed
453
568
  she'll
569
+ she's
570
+ shed
454
571
  shes
572
+ shit
455
573
  should
456
574
  shouldn't
457
575
  show
@@ -479,13 +597,17 @@ somewhat
479
597
  somewhere
480
598
  soon
481
599
  sorry
600
+ sort
601
+ sounds
482
602
  specifically
483
603
  specified
484
604
  specify
485
605
  specifying
606
+ start
486
607
  still
487
608
  stop
488
609
  strongly
610
+ stuff
489
611
  sub
490
612
  substantially
491
613
  successfully
@@ -494,11 +616,16 @@ sufficiently
494
616
  suggest
495
617
  sup
496
618
  sure
619
+ surely
620
+ sweet
497
621
  t
498
622
  take
499
623
  taken
500
624
  taking
625
+ talk
626
+ talking
501
627
  tell
628
+ tend
502
629
  tends
503
630
  th
504
631
  than
@@ -506,9 +633,11 @@ thank
506
633
  thanks
507
634
  thanx
508
635
  that
636
+ that'd
509
637
  that'll
510
- thats
638
+ that's
511
639
  that've
640
+ thats
512
641
  the
513
642
  their
514
643
  theirs
@@ -517,55 +646,74 @@ themselves
517
646
  then
518
647
  thence
519
648
  there
649
+ there'll
650
+ there's
651
+ there've
520
652
  thereafter
521
653
  thereby
522
654
  thered
523
655
  therefore
524
656
  therein
525
- there'll
526
657
  thereof
527
658
  therere
528
659
  theres
529
660
  thereto
530
661
  thereupon
531
- there've
532
662
  these
533
663
  they
534
- theyd
535
664
  they'll
536
- theyre
665
+ they're
537
666
  they've
667
+ theyd
668
+ theyre
669
+ thing
670
+ things
538
671
  think
539
672
  this
673
+ tho
540
674
  those
541
675
  thou
542
676
  though
543
677
  thoughh
678
+ thought
544
679
  thousand
680
+ three
545
681
  throug
546
682
  through
547
683
  throughout
548
684
  thru
549
685
  thus
550
686
  til
687
+ time
688
+ times
551
689
  tip
552
690
  to
691
+ today
553
692
  together
693
+ told
694
+ tomorrow
695
+ tonight
554
696
  too
555
697
  took
698
+ totally
556
699
  toward
557
700
  towards
558
701
  tried
559
702
  tries
703
+ true
560
704
  truly
561
705
  try
562
706
  trying
563
707
  ts
708
+ tweet
564
709
  twice
710
+ twitter
565
711
  two
566
712
  u
713
+ ugh
567
714
  un
568
715
  under
716
+ understand
569
717
  unfortunately
570
718
  unless
571
719
  unlike
@@ -587,30 +735,37 @@ usually
587
735
  v
588
736
  value
589
737
  various
590
- 've
591
738
  very
592
739
  via
740
+ video
593
741
  viz
594
742
  vol
595
743
  vols
596
744
  vs
597
745
  w
746
+ wait
747
+ wanna
598
748
  want
599
749
  wants
600
750
  was
601
751
  wasn't
602
752
  way
603
753
  we
754
+ we'll
755
+ we're
756
+ we've
757
+ wear
604
758
  wed
759
+ weird
605
760
  welcome
606
- we'll
761
+ well
607
762
  went
608
763
  were
609
764
  weren't
610
- we've
611
765
  what
612
- whatever
613
766
  what'll
767
+ what's
768
+ whatever
614
769
  whats
615
770
  when
616
771
  whence
@@ -629,215 +784,57 @@ while
629
784
  whim
630
785
  whither
631
786
  who
787
+ who'll
788
+ whoa
632
789
  whod
633
790
  whoever
634
791
  whole
635
- who'll
636
792
  whom
637
793
  whomever
794
+ whoops
638
795
  whos
639
796
  whose
640
797
  why
641
798
  widely
799
+ will
642
800
  willing
643
801
  wish
644
802
  with
645
803
  within
646
804
  without
805
+ woah
647
806
  won't
807
+ word
648
808
  words
809
+ work
810
+ working
811
+ works
649
812
  world
650
813
  would
651
814
  wouldn't
815
+ wow
816
+ wrong
652
817
  www
653
818
  x
654
819
  y
820
+ y'know
821
+ yeah
822
+ year
823
+ years
824
+ yep
655
825
  yes
656
826
  yet
657
827
  you
658
- youd
828
+ you'd
659
829
  you'll
830
+ you're
831
+ you've
832
+ youd
660
833
  your
661
834
  youre
662
835
  yours
663
836
  yourself
664
837
  yourselves
665
- you've
838
+ yup
666
839
  z
667
840
  zero
668
- .
669
- ?
670
- !
671
-
672
- http
673
- don
674
- people
675
- well
676
- will
677
- https
678
- time
679
- good
680
- thing
681
- twitter
682
- pretty
683
- it's
684
- i'm
685
- that's
686
- you're
687
- they're
688
- there's
689
- things
690
- yeah
691
- find
692
- going
693
- work
694
- point
695
- years
696
- guess
697
- bad
698
- problem
699
- real
700
- kind
701
- day
702
- better
703
- lot
704
- stuff
705
- i'd
706
- read
707
- thought
708
- idea
709
- case
710
- word
711
- hey
712
- person
713
- long
714
- Dear
715
- internet
716
- tweet
717
- he's
718
- feel
719
- wrong
720
- call
721
- hard
722
- phone
723
- ago
724
- literally
725
- remember
726
- reason
727
- called
728
- course
729
- bit
730
- question
731
- high
732
- today
733
- told
734
- man
735
- actual
736
- year
737
- three
738
- book
739
- assume
740
- life
741
- true
742
- best
743
- wow
744
- video
745
- times
746
- works
747
- fact
748
- completely
749
- totally
750
- imo
751
- open
752
- lol
753
- haha
754
- cool
755
- yep
756
- ooh
757
- great
758
- ugh
759
- tonight
760
- talk
761
- sounds
762
- hahaha
763
- whoa
764
- cool
765
- we're
766
- guys
767
- sweet
768
- fortunately
769
- hmm
770
- aren't
771
- sadly
772
- talking
773
- you'd
774
- place
775
- yup
776
- what's
777
- y'know
778
- basically
779
- god
780
- shit
781
- holy
782
- interesting
783
- news
784
- guy
785
- wait
786
- oooh
787
- gonna
788
- current
789
- let's
790
- tomorrow
791
- omg
792
- hate
793
- hope
794
- fuck
795
- oops
796
- night
797
- wear
798
- wanna
799
- fun
800
- finally
801
- whoops
802
- nevermind
803
- definitely
804
- context
805
- screen
806
- free
807
- exactly
808
- big
809
- house
810
- half
811
- working
812
- play
813
- heard
814
- hmmm
815
- damn
816
- woah
817
- tho
818
- set
819
- idk
820
- sort
821
- understand
822
- kinda
823
- seriously
824
- btw
825
- she's
826
- hah
827
- aww
828
- ffs
829
- it'd
830
- that'd
831
- hopefully
832
- non
833
- entirely
834
- lots
835
- entire
836
- tend
837
- hullo
838
- clearly
839
- surely
840
- weird
841
- start
842
- help
843
- nope