PgsFile 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PgsFile might be problematic. Click here for more details.
- PgsFile/Corpora/Stopwords/english.txt +97 -91
- PgsFile/PgsFile.py +57 -1
- PgsFile/__init__.py +1 -1
- PgsFile/models/dics/unigram_freq_only.json +1 -0
- {PgsFile-0.3.8.dist-info → PgsFile-0.3.9.dist-info}/METADATA +2 -2
- {PgsFile-0.3.8.dist-info → PgsFile-0.3.9.dist-info}/RECORD +9 -8
- {PgsFile-0.3.8.dist-info → PgsFile-0.3.9.dist-info}/LICENSE +0 -0
- {PgsFile-0.3.8.dist-info → PgsFile-0.3.9.dist-info}/WHEEL +0 -0
- {PgsFile-0.3.8.dist-info → PgsFile-0.3.9.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,10 @@
|
|
|
1
|
+
'd
|
|
1
2
|
'll
|
|
2
|
-
'
|
|
3
|
-
'
|
|
3
|
+
'm
|
|
4
|
+
're
|
|
5
|
+
's
|
|
4
6
|
've
|
|
5
|
-
10
|
|
6
|
-
39
|
|
7
7
|
a
|
|
8
|
-
a's
|
|
9
8
|
able
|
|
10
9
|
ableabout
|
|
11
10
|
about
|
|
@@ -36,8 +35,8 @@ ago
|
|
|
36
35
|
ah
|
|
37
36
|
ahead
|
|
38
37
|
ai
|
|
39
|
-
ain't
|
|
40
38
|
aint
|
|
39
|
+
ain't
|
|
41
40
|
al
|
|
42
41
|
all
|
|
43
42
|
allow
|
|
@@ -83,12 +82,13 @@ are
|
|
|
83
82
|
area
|
|
84
83
|
areas
|
|
85
84
|
aren
|
|
86
|
-
aren't
|
|
87
85
|
arent
|
|
86
|
+
aren't
|
|
88
87
|
arise
|
|
89
88
|
around
|
|
90
89
|
arpa
|
|
91
90
|
as
|
|
91
|
+
a's
|
|
92
92
|
aside
|
|
93
93
|
ask
|
|
94
94
|
asked
|
|
@@ -164,15 +164,13 @@ bw
|
|
|
164
164
|
by
|
|
165
165
|
bz
|
|
166
166
|
c
|
|
167
|
-
c'mon
|
|
168
|
-
c's
|
|
169
167
|
ca
|
|
170
168
|
call
|
|
171
169
|
came
|
|
172
170
|
can
|
|
173
|
-
can't
|
|
174
171
|
cannot
|
|
175
172
|
cant
|
|
173
|
+
can't
|
|
176
174
|
caption
|
|
177
175
|
case
|
|
178
176
|
cases
|
|
@@ -194,6 +192,7 @@ clearly
|
|
|
194
192
|
click
|
|
195
193
|
cm
|
|
196
194
|
cmon
|
|
195
|
+
c'mon
|
|
197
196
|
cn
|
|
198
197
|
co
|
|
199
198
|
co.
|
|
@@ -212,14 +211,15 @@ contains
|
|
|
212
211
|
copy
|
|
213
212
|
corresponding
|
|
214
213
|
could
|
|
215
|
-
could've
|
|
216
214
|
couldn
|
|
217
|
-
couldn't
|
|
218
215
|
couldnt
|
|
216
|
+
couldn't
|
|
217
|
+
could've
|
|
219
218
|
course
|
|
220
219
|
cr
|
|
221
220
|
cry
|
|
222
221
|
cs
|
|
222
|
+
c's
|
|
223
223
|
cu
|
|
224
224
|
currently
|
|
225
225
|
cv
|
|
@@ -227,9 +227,10 @@ cx
|
|
|
227
227
|
cy
|
|
228
228
|
cz
|
|
229
229
|
d
|
|
230
|
+
d
|
|
230
231
|
dare
|
|
231
|
-
daren't
|
|
232
232
|
darent
|
|
233
|
+
daren't
|
|
233
234
|
date
|
|
234
235
|
de
|
|
235
236
|
dear
|
|
@@ -240,8 +241,8 @@ despite
|
|
|
240
241
|
detail
|
|
241
242
|
did
|
|
242
243
|
didn
|
|
243
|
-
didn't
|
|
244
244
|
didnt
|
|
245
|
+
didn't
|
|
245
246
|
differ
|
|
246
247
|
different
|
|
247
248
|
differently
|
|
@@ -252,13 +253,13 @@ dm
|
|
|
252
253
|
do
|
|
253
254
|
does
|
|
254
255
|
doesn
|
|
255
|
-
doesn't
|
|
256
256
|
doesnt
|
|
257
|
+
doesn't
|
|
257
258
|
doing
|
|
258
259
|
don
|
|
259
|
-
don't
|
|
260
260
|
done
|
|
261
261
|
dont
|
|
262
|
+
don't
|
|
262
263
|
doubtful
|
|
263
264
|
down
|
|
264
265
|
downed
|
|
@@ -412,41 +413,40 @@ gw
|
|
|
412
413
|
gy
|
|
413
414
|
h
|
|
414
415
|
had
|
|
415
|
-
hadn't
|
|
416
416
|
hadnt
|
|
417
|
+
hadn't
|
|
417
418
|
half
|
|
418
419
|
happens
|
|
419
420
|
hardly
|
|
420
421
|
has
|
|
421
422
|
hasn
|
|
422
|
-
hasn't
|
|
423
423
|
hasnt
|
|
424
|
+
hasn't
|
|
424
425
|
have
|
|
425
426
|
haven
|
|
426
|
-
haven't
|
|
427
427
|
havent
|
|
428
|
+
haven't
|
|
428
429
|
having
|
|
429
430
|
he
|
|
430
|
-
he'd
|
|
431
|
-
he'll
|
|
432
|
-
he's
|
|
433
431
|
hed
|
|
432
|
+
he'd
|
|
434
433
|
hell
|
|
434
|
+
he'll
|
|
435
435
|
hello
|
|
436
436
|
help
|
|
437
437
|
hence
|
|
438
438
|
her
|
|
439
439
|
here
|
|
440
|
-
here's
|
|
441
440
|
hereafter
|
|
442
441
|
hereby
|
|
443
442
|
herein
|
|
444
443
|
heres
|
|
444
|
+
here's
|
|
445
445
|
hereupon
|
|
446
446
|
hers
|
|
447
447
|
herself
|
|
448
|
-
herse”
|
|
449
448
|
hes
|
|
449
|
+
he's
|
|
450
450
|
hi
|
|
451
451
|
hid
|
|
452
452
|
high
|
|
@@ -454,7 +454,6 @@ higher
|
|
|
454
454
|
highest
|
|
455
455
|
him
|
|
456
456
|
himself
|
|
457
|
-
himse”
|
|
458
457
|
his
|
|
459
458
|
hither
|
|
460
459
|
hk
|
|
@@ -464,11 +463,11 @@ home
|
|
|
464
463
|
homepage
|
|
465
464
|
hopefully
|
|
466
465
|
how
|
|
466
|
+
howbeit
|
|
467
467
|
how'd
|
|
468
|
+
however
|
|
468
469
|
how'll
|
|
469
470
|
how's
|
|
470
|
-
howbeit
|
|
471
|
-
however
|
|
472
471
|
hr
|
|
473
472
|
ht
|
|
474
473
|
htm
|
|
@@ -477,19 +476,18 @@ http
|
|
|
477
476
|
hu
|
|
478
477
|
hundred
|
|
479
478
|
i
|
|
480
|
-
i'd
|
|
481
|
-
i'll
|
|
482
|
-
i'm
|
|
483
|
-
i've
|
|
484
479
|
i.e.
|
|
485
480
|
id
|
|
481
|
+
i'd
|
|
486
482
|
ie
|
|
487
483
|
if
|
|
488
484
|
ignored
|
|
489
485
|
ii
|
|
490
486
|
il
|
|
491
487
|
ill
|
|
488
|
+
i'll
|
|
492
489
|
im
|
|
490
|
+
i'm
|
|
493
491
|
immediate
|
|
494
492
|
immediately
|
|
495
493
|
importance
|
|
@@ -521,18 +519,18 @@ iq
|
|
|
521
519
|
ir
|
|
522
520
|
is
|
|
523
521
|
isn
|
|
524
|
-
isn't
|
|
525
522
|
isnt
|
|
523
|
+
isn't
|
|
526
524
|
it
|
|
527
|
-
it'd
|
|
528
|
-
it'll
|
|
529
|
-
it's
|
|
530
525
|
itd
|
|
526
|
+
it'd
|
|
531
527
|
itll
|
|
528
|
+
it'll
|
|
532
529
|
its
|
|
530
|
+
it's
|
|
533
531
|
itself
|
|
534
|
-
itse”
|
|
535
532
|
ive
|
|
533
|
+
i've
|
|
536
534
|
j
|
|
537
535
|
je
|
|
538
536
|
jm
|
|
@@ -578,8 +576,8 @@ length
|
|
|
578
576
|
less
|
|
579
577
|
lest
|
|
580
578
|
let
|
|
581
|
-
let's
|
|
582
579
|
lets
|
|
580
|
+
let's
|
|
583
581
|
li
|
|
584
582
|
like
|
|
585
583
|
liked
|
|
@@ -589,6 +587,7 @@ line
|
|
|
589
587
|
little
|
|
590
588
|
lk
|
|
591
589
|
ll
|
|
590
|
+
ll
|
|
592
591
|
long
|
|
593
592
|
longer
|
|
594
593
|
longest
|
|
@@ -605,6 +604,7 @@ lu
|
|
|
605
604
|
lv
|
|
606
605
|
ly
|
|
607
606
|
m
|
|
607
|
+
m
|
|
608
608
|
ma
|
|
609
609
|
made
|
|
610
610
|
mainly
|
|
@@ -615,8 +615,8 @@ man
|
|
|
615
615
|
many
|
|
616
616
|
may
|
|
617
617
|
maybe
|
|
618
|
-
mayn't
|
|
619
618
|
maynt
|
|
619
|
+
mayn't
|
|
620
620
|
mc
|
|
621
621
|
md
|
|
622
622
|
me
|
|
@@ -632,9 +632,9 @@ mg
|
|
|
632
632
|
mh
|
|
633
633
|
microsoft
|
|
634
634
|
might
|
|
635
|
-
might've
|
|
636
|
-
mightn't
|
|
637
635
|
mightnt
|
|
636
|
+
mightn't
|
|
637
|
+
might've
|
|
638
638
|
mil
|
|
639
639
|
mill
|
|
640
640
|
million
|
|
@@ -662,17 +662,17 @@ mu
|
|
|
662
662
|
much
|
|
663
663
|
mug
|
|
664
664
|
must
|
|
665
|
-
must've
|
|
666
|
-
mustn't
|
|
667
665
|
mustnt
|
|
666
|
+
mustn't
|
|
667
|
+
must've
|
|
668
668
|
mv
|
|
669
669
|
mw
|
|
670
670
|
mx
|
|
671
671
|
my
|
|
672
672
|
myself
|
|
673
|
-
myse”
|
|
674
673
|
mz
|
|
675
674
|
n
|
|
675
|
+
n't
|
|
676
676
|
na
|
|
677
677
|
name
|
|
678
678
|
namely
|
|
@@ -687,8 +687,8 @@ necessary
|
|
|
687
687
|
need
|
|
688
688
|
needed
|
|
689
689
|
needing
|
|
690
|
-
needn't
|
|
691
690
|
neednt
|
|
691
|
+
needn't
|
|
692
692
|
needs
|
|
693
693
|
neither
|
|
694
694
|
net
|
|
@@ -708,12 +708,12 @@ nine
|
|
|
708
708
|
ninety
|
|
709
709
|
nl
|
|
710
710
|
no
|
|
711
|
-
no-one
|
|
712
711
|
nobody
|
|
713
712
|
non
|
|
714
713
|
none
|
|
715
714
|
nonetheless
|
|
716
715
|
noone
|
|
716
|
+
no-one
|
|
717
717
|
nor
|
|
718
718
|
normally
|
|
719
719
|
nos
|
|
@@ -726,6 +726,7 @@ now
|
|
|
726
726
|
nowhere
|
|
727
727
|
np
|
|
728
728
|
nr
|
|
729
|
+
n't
|
|
729
730
|
nu
|
|
730
731
|
null
|
|
731
732
|
number
|
|
@@ -749,8 +750,8 @@ omitted
|
|
|
749
750
|
on
|
|
750
751
|
once
|
|
751
752
|
one
|
|
752
|
-
one's
|
|
753
753
|
ones
|
|
754
|
+
one's
|
|
754
755
|
only
|
|
755
756
|
onto
|
|
756
757
|
open
|
|
@@ -769,8 +770,8 @@ other
|
|
|
769
770
|
others
|
|
770
771
|
otherwise
|
|
771
772
|
ought
|
|
772
|
-
oughtn't
|
|
773
773
|
oughtnt
|
|
774
|
+
oughtn't
|
|
774
775
|
our
|
|
775
776
|
ours
|
|
776
777
|
ourselves
|
|
@@ -848,6 +849,7 @@ ran
|
|
|
848
849
|
rather
|
|
849
850
|
rd
|
|
850
851
|
re
|
|
852
|
+
re
|
|
851
853
|
readily
|
|
852
854
|
really
|
|
853
855
|
reasonably
|
|
@@ -876,6 +878,7 @@ ru
|
|
|
876
878
|
run
|
|
877
879
|
rw
|
|
878
880
|
s
|
|
881
|
+
s
|
|
879
882
|
sa
|
|
880
883
|
said
|
|
881
884
|
same
|
|
@@ -912,20 +915,20 @@ several
|
|
|
912
915
|
sg
|
|
913
916
|
sh
|
|
914
917
|
shall
|
|
915
|
-
shan't
|
|
916
918
|
shant
|
|
919
|
+
shan't
|
|
917
920
|
she
|
|
918
|
-
she'd
|
|
919
|
-
she'll
|
|
920
|
-
she's
|
|
921
921
|
shed
|
|
922
|
+
she'd
|
|
922
923
|
shell
|
|
924
|
+
she'll
|
|
923
925
|
shes
|
|
926
|
+
she's
|
|
924
927
|
should
|
|
925
|
-
should've
|
|
926
928
|
shouldn
|
|
927
|
-
shouldn't
|
|
928
929
|
shouldnt
|
|
930
|
+
shouldn't
|
|
931
|
+
should've
|
|
929
932
|
show
|
|
930
933
|
showed
|
|
931
934
|
showing
|
|
@@ -992,7 +995,6 @@ sy
|
|
|
992
995
|
system
|
|
993
996
|
sz
|
|
994
997
|
t
|
|
995
|
-
t's
|
|
996
998
|
take
|
|
997
999
|
taken
|
|
998
1000
|
taking
|
|
@@ -1011,12 +1013,12 @@ thank
|
|
|
1011
1013
|
thanks
|
|
1012
1014
|
thanx
|
|
1013
1015
|
that
|
|
1014
|
-
that'll
|
|
1015
|
-
that's
|
|
1016
|
-
that've
|
|
1017
1016
|
thatll
|
|
1017
|
+
that'll
|
|
1018
1018
|
thats
|
|
1019
|
+
that's
|
|
1019
1020
|
thatve
|
|
1021
|
+
that've
|
|
1020
1022
|
the
|
|
1021
1023
|
their
|
|
1022
1024
|
theirs
|
|
@@ -1025,33 +1027,33 @@ themselves
|
|
|
1025
1027
|
then
|
|
1026
1028
|
thence
|
|
1027
1029
|
there
|
|
1028
|
-
there'd
|
|
1029
|
-
there'll
|
|
1030
|
-
there're
|
|
1031
|
-
there's
|
|
1032
|
-
there've
|
|
1033
1030
|
thereafter
|
|
1034
1031
|
thereby
|
|
1035
1032
|
thered
|
|
1033
|
+
there'd
|
|
1036
1034
|
therefore
|
|
1037
1035
|
therein
|
|
1038
1036
|
therell
|
|
1037
|
+
there'll
|
|
1039
1038
|
thereof
|
|
1040
1039
|
therere
|
|
1040
|
+
there're
|
|
1041
1041
|
theres
|
|
1042
|
+
there's
|
|
1042
1043
|
thereto
|
|
1043
1044
|
thereupon
|
|
1044
1045
|
thereve
|
|
1046
|
+
there've
|
|
1045
1047
|
these
|
|
1046
1048
|
they
|
|
1047
|
-
they'd
|
|
1048
|
-
they'll
|
|
1049
|
-
they're
|
|
1050
|
-
they've
|
|
1051
1049
|
theyd
|
|
1050
|
+
they'd
|
|
1052
1051
|
theyll
|
|
1052
|
+
they'll
|
|
1053
1053
|
theyre
|
|
1054
|
+
they're
|
|
1054
1055
|
theyve
|
|
1056
|
+
they've
|
|
1055
1057
|
thick
|
|
1056
1058
|
thin
|
|
1057
1059
|
thing
|
|
@@ -1080,6 +1082,7 @@ til
|
|
|
1080
1082
|
till
|
|
1081
1083
|
tip
|
|
1082
1084
|
tis
|
|
1085
|
+
tis
|
|
1083
1086
|
tj
|
|
1084
1087
|
tk
|
|
1085
1088
|
tm
|
|
@@ -1101,6 +1104,7 @@ truly
|
|
|
1101
1104
|
try
|
|
1102
1105
|
trying
|
|
1103
1106
|
ts
|
|
1107
|
+
t's
|
|
1104
1108
|
tt
|
|
1105
1109
|
turn
|
|
1106
1110
|
turned
|
|
@@ -1109,6 +1113,7 @@ turns
|
|
|
1109
1113
|
tv
|
|
1110
1114
|
tw
|
|
1111
1115
|
twas
|
|
1116
|
+
twas
|
|
1112
1117
|
twelve
|
|
1113
1118
|
twenty
|
|
1114
1119
|
twice
|
|
@@ -1151,6 +1156,7 @@ value
|
|
|
1151
1156
|
various
|
|
1152
1157
|
vc
|
|
1153
1158
|
ve
|
|
1159
|
+
ve
|
|
1154
1160
|
versus
|
|
1155
1161
|
very
|
|
1156
1162
|
vg
|
|
@@ -1169,53 +1175,53 @@ wanting
|
|
|
1169
1175
|
wants
|
|
1170
1176
|
was
|
|
1171
1177
|
wasn
|
|
1172
|
-
wasn't
|
|
1173
1178
|
wasnt
|
|
1179
|
+
wasn't
|
|
1174
1180
|
way
|
|
1175
1181
|
ways
|
|
1176
1182
|
we
|
|
1177
|
-
we'd
|
|
1178
|
-
we'll
|
|
1179
|
-
we're
|
|
1180
|
-
we've
|
|
1181
1183
|
web
|
|
1182
1184
|
webpage
|
|
1183
1185
|
website
|
|
1184
1186
|
wed
|
|
1187
|
+
we'd
|
|
1185
1188
|
welcome
|
|
1186
1189
|
well
|
|
1190
|
+
we'll
|
|
1187
1191
|
wells
|
|
1188
1192
|
went
|
|
1189
1193
|
were
|
|
1194
|
+
we're
|
|
1190
1195
|
weren
|
|
1191
|
-
weren't
|
|
1192
1196
|
werent
|
|
1197
|
+
weren't
|
|
1193
1198
|
weve
|
|
1199
|
+
we've
|
|
1194
1200
|
wf
|
|
1195
1201
|
what
|
|
1196
1202
|
what'd
|
|
1197
|
-
what'll
|
|
1198
|
-
what's
|
|
1199
|
-
what've
|
|
1200
1203
|
whatever
|
|
1201
1204
|
whatll
|
|
1205
|
+
what'll
|
|
1202
1206
|
whats
|
|
1207
|
+
what's
|
|
1203
1208
|
whatve
|
|
1209
|
+
what've
|
|
1204
1210
|
when
|
|
1211
|
+
whence
|
|
1205
1212
|
when'd
|
|
1213
|
+
whenever
|
|
1206
1214
|
when'll
|
|
1207
1215
|
when's
|
|
1208
|
-
whence
|
|
1209
|
-
whenever
|
|
1210
1216
|
where
|
|
1211
|
-
where'd
|
|
1212
|
-
where'll
|
|
1213
|
-
where's
|
|
1214
1217
|
whereafter
|
|
1215
1218
|
whereas
|
|
1216
1219
|
whereby
|
|
1220
|
+
where'd
|
|
1217
1221
|
wherein
|
|
1222
|
+
where'll
|
|
1218
1223
|
wheres
|
|
1224
|
+
where's
|
|
1219
1225
|
whereupon
|
|
1220
1226
|
wherever
|
|
1221
1227
|
whether
|
|
@@ -1226,16 +1232,16 @@ whilst
|
|
|
1226
1232
|
whim
|
|
1227
1233
|
whither
|
|
1228
1234
|
who
|
|
1229
|
-
who'd
|
|
1230
|
-
who'll
|
|
1231
|
-
who's
|
|
1232
1235
|
whod
|
|
1236
|
+
who'd
|
|
1233
1237
|
whoever
|
|
1234
1238
|
whole
|
|
1235
1239
|
wholl
|
|
1240
|
+
who'll
|
|
1236
1241
|
whom
|
|
1237
1242
|
whomever
|
|
1238
1243
|
whos
|
|
1244
|
+
who's
|
|
1239
1245
|
whose
|
|
1240
1246
|
why
|
|
1241
1247
|
why'd
|
|
@@ -1250,9 +1256,9 @@ with
|
|
|
1250
1256
|
within
|
|
1251
1257
|
without
|
|
1252
1258
|
won
|
|
1253
|
-
won't
|
|
1254
1259
|
wonder
|
|
1255
1260
|
wont
|
|
1261
|
+
won't
|
|
1256
1262
|
words
|
|
1257
1263
|
work
|
|
1258
1264
|
worked
|
|
@@ -1260,10 +1266,10 @@ working
|
|
|
1260
1266
|
works
|
|
1261
1267
|
world
|
|
1262
1268
|
would
|
|
1263
|
-
would've
|
|
1264
1269
|
wouldn
|
|
1265
|
-
wouldn't
|
|
1266
1270
|
wouldnt
|
|
1271
|
+
wouldn't
|
|
1272
|
+
would've
|
|
1267
1273
|
ws
|
|
1268
1274
|
www
|
|
1269
1275
|
x
|
|
@@ -1274,25 +1280,25 @@ years
|
|
|
1274
1280
|
yes
|
|
1275
1281
|
yet
|
|
1276
1282
|
you
|
|
1277
|
-
you'd
|
|
1278
|
-
you'll
|
|
1279
|
-
you're
|
|
1280
|
-
you've
|
|
1281
1283
|
youd
|
|
1284
|
+
you'd
|
|
1282
1285
|
youll
|
|
1286
|
+
you'll
|
|
1283
1287
|
young
|
|
1284
1288
|
younger
|
|
1285
1289
|
youngest
|
|
1286
1290
|
your
|
|
1287
1291
|
youre
|
|
1292
|
+
you're
|
|
1288
1293
|
yours
|
|
1289
1294
|
yourself
|
|
1290
1295
|
yourselves
|
|
1291
1296
|
youve
|
|
1297
|
+
you've
|
|
1292
1298
|
yt
|
|
1293
1299
|
yu
|
|
1294
1300
|
z
|
|
1295
1301
|
za
|
|
1296
1302
|
zero
|
|
1297
1303
|
zm
|
|
1298
|
-
zr
|
|
1304
|
+
zr
|
PgsFile/PgsFile.py
CHANGED
|
@@ -3795,4 +3795,60 @@ def perform_liwc_zh(dic_path, file_path, output_excel_path):
|
|
|
3795
3795
|
|
|
3796
3796
|
import pandas as pd
|
|
3797
3797
|
df = pd.DataFrame(data,columns=[u'类别', u'出现词种数', u'占词表百分比', u'出现词次', u'总词次', u'覆盖率', u'例词'])
|
|
3798
|
-
df.to_excel(output_excel_path,'sheet1',index=False)
|
|
3798
|
+
df.to_excel(output_excel_path,'sheet1',index=False)
|
|
3799
|
+
|
|
3800
|
+
|
|
3801
|
+
import math
|
|
3802
|
+
from collections import defaultdict
|
|
3803
|
+
def calculate_log_likelihood(target_count, reference_count, total_target, total_reference):
    """Return the log-likelihood (G^2) keyness statistic for a single word.

    The statistic compares the observed absolute frequency of a word in a
    target text and in a reference corpus with the frequencies expected if
    the word were equally likely in both.

    Args:
        target_count: Absolute frequency of the word in the target text.
        reference_count: Absolute frequency of the word in the reference corpus.
        total_target: Total number of tokens counted in the target text.
        total_reference: Total number of tokens counted in the reference corpus.

    Returns:
        ``2 * sum(observed * ln(observed / expected))`` over the two corpora,
        as a float. A corpus in which the word does not occur contributes
        nothing. ``0.0`` is returned when both corpora are empty.
    """
    total_combined = total_target + total_reference
    if total_combined == 0:
        # No tokens at all: there is nothing to compare, and dividing by the
        # combined size below would raise ZeroDivisionError.
        return 0.0

    # Expected frequencies: the pooled count split in proportion to corpus size.
    combined_count = target_count + reference_count
    expected_target = combined_count * (total_target / total_combined)
    expected_reference = combined_count * (total_reference / total_combined)

    ll = 0.0
    # A zero observed count contributes 0 (limit of x*ln(x) as x -> 0). A zero
    # expected value can only arise from an empty corpus and is skipped so the
    # division cannot fail.
    if target_count > 0 and expected_target > 0:
        ll += target_count * math.log(target_count / expected_target)
    if reference_count > 0 and expected_reference > 0:
        ll += reference_count * math.log(reference_count / expected_reference)

    return ll * 2  # G^2 statistic
|
|
3818
|
+
|
|
3819
|
+
def extract_keywords_en(target_text, top_n=10):
    """Rank the words of *target_text* by log-likelihood keyness.

    The text is tokenized with ``word_tokenize2``; tokens whose lower-cased
    form is found in ``BigPunctuation`` or in the stopword collection returned
    by ``get_stopwords()`` are discarded. Each remaining word is scored with
    ``calculate_log_likelihood`` against the absolute frequencies loaded from
    the bundled ``unigram_freq_only.json`` dictionary.

    Args:
        target_text: Text to extract keywords from.
        top_n: Number of highest-scoring entries to return (used as a slice
            bound on the sorted result).

    Returns:
        A list of ``(word, count, relative_frequency, log_likelihood)`` tuples
        sorted by descending log-likelihood. ``word`` keeps the casing of its
        first occurrence in the text; ``relative_frequency`` is the count
        divided by the number of retained tokens. An input with no retained
        tokens yields an empty list.
    """
    # Reference frequencies shipped inside the installed package.
    my_dic_path = get_library_location("PgsFile") + "/PgsFile/models/dics/unigram_freq_only.json"
    reference_freq = get_data_json(my_dic_path)
    total_reference = sum(reference_freq.values())

    # Fetch the stopwords once instead of once per token.
    stopwords = get_stopwords()
    if isinstance(stopwords, (list, tuple)):
        # NOTE(review): assumes the entries are plain strings - confirm against
        # get_stopwords(). A set gives constant-time membership tests.
        stopwords = frozenset(stopwords)

    # Single pass: count retained words (keyed by lower case) and remember the
    # first-seen original casing. Dict insertion order follows first
    # occurrence, which keeps tie ordering stable in the sort below.
    target_word_freq = {}
    word_case_mapping = {}
    total_target = 0
    for orig_word in word_tokenize2(target_text):
        lower_word = orig_word.lower()
        if lower_word in BigPunctuation or lower_word in stopwords:
            continue
        total_target += 1
        target_word_freq[lower_word] = target_word_freq.get(lower_word, 0) + 1
        word_case_mapping.setdefault(lower_word, orig_word)

    # Score every distinct word against the reference corpus.
    # NOTE(review): G^2 is unsigned, so a word that is strongly under-used in
    # the target text can also rank highly - confirm this is intended.
    keyword_scores = []
    for word, target_count in target_word_freq.items():
        reference_count = reference_freq.get(word, 0)
        ll = calculate_log_likelihood(target_count, reference_count, total_target, total_reference)
        relative_freq = target_count / total_target
        keyword_scores.append((word_case_mapping[word], target_count, relative_freq, ll))

    # Highest log-likelihood first; the sort is stable for equal scores.
    keyword_scores.sort(key=lambda item: item[3], reverse=True)

    return keyword_scores[:top_n]
|
PgsFile/__init__.py
CHANGED
|
@@ -49,7 +49,7 @@ from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity,
|
|
|
49
49
|
from .PgsFile import word_list, batch_word_list
|
|
50
50
|
from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2
|
|
51
51
|
from .PgsFile import word_lemmatize, word_POS, word_NER
|
|
52
|
-
from .PgsFile import extract_noun_phrases, get_LLMs_prompt
|
|
52
|
+
from .PgsFile import extract_noun_phrases, get_LLMs_prompt, extract_keywords_en
|
|
53
53
|
from .PgsFile import extract_dependency_relations, extract_dependency_relations_full
|
|
54
54
|
from .PgsFile import predict_category
|
|
55
55
|
|