PgsFile 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of PgsFile might be problematic. Click here for more details.

@@ -1,11 +1,10 @@
1
+ 'd
1
2
  'll
2
- 'tis
3
- 'twas
3
+ 'm
4
+ 're
5
+ 's
4
6
  've
5
- 10
6
- 39
7
7
  a
8
- a's
9
8
  able
10
9
  ableabout
11
10
  about
@@ -36,8 +35,8 @@ ago
36
35
  ah
37
36
  ahead
38
37
  ai
39
- ain't
40
38
  aint
39
+ ain't
41
40
  al
42
41
  all
43
42
  allow
@@ -83,12 +82,13 @@ are
83
82
  area
84
83
  areas
85
84
  aren
86
- aren't
87
85
  arent
86
+ aren't
88
87
  arise
89
88
  around
90
89
  arpa
91
90
  as
91
+ a's
92
92
  aside
93
93
  ask
94
94
  asked
@@ -164,15 +164,13 @@ bw
164
164
  by
165
165
  bz
166
166
  c
167
- c'mon
168
- c's
169
167
  ca
170
168
  call
171
169
  came
172
170
  can
173
- can't
174
171
  cannot
175
172
  cant
173
+ can't
176
174
  caption
177
175
  case
178
176
  cases
@@ -194,6 +192,7 @@ clearly
194
192
  click
195
193
  cm
196
194
  cmon
195
+ c'mon
197
196
  cn
198
197
  co
199
198
  co.
@@ -212,14 +211,15 @@ contains
212
211
  copy
213
212
  corresponding
214
213
  could
215
- could've
216
214
  couldn
217
- couldn't
218
215
  couldnt
216
+ couldn't
217
+ could've
219
218
  course
220
219
  cr
221
220
  cry
222
221
  cs
222
+ c's
223
223
  cu
224
224
  currently
225
225
  cv
@@ -227,9 +227,10 @@ cx
227
227
  cy
228
228
  cz
229
229
  d
230
+ d
230
231
  dare
231
- daren't
232
232
  darent
233
+ daren't
233
234
  date
234
235
  de
235
236
  dear
@@ -240,8 +241,8 @@ despite
240
241
  detail
241
242
  did
242
243
  didn
243
- didn't
244
244
  didnt
245
+ didn't
245
246
  differ
246
247
  different
247
248
  differently
@@ -252,13 +253,13 @@ dm
252
253
  do
253
254
  does
254
255
  doesn
255
- doesn't
256
256
  doesnt
257
+ doesn't
257
258
  doing
258
259
  don
259
- don't
260
260
  done
261
261
  dont
262
+ don't
262
263
  doubtful
263
264
  down
264
265
  downed
@@ -412,41 +413,40 @@ gw
412
413
  gy
413
414
  h
414
415
  had
415
- hadn't
416
416
  hadnt
417
+ hadn't
417
418
  half
418
419
  happens
419
420
  hardly
420
421
  has
421
422
  hasn
422
- hasn't
423
423
  hasnt
424
+ hasn't
424
425
  have
425
426
  haven
426
- haven't
427
427
  havent
428
+ haven't
428
429
  having
429
430
  he
430
- he'd
431
- he'll
432
- he's
433
431
  hed
432
+ he'd
434
433
  hell
434
+ he'll
435
435
  hello
436
436
  help
437
437
  hence
438
438
  her
439
439
  here
440
- here's
441
440
  hereafter
442
441
  hereby
443
442
  herein
444
443
  heres
444
+ here's
445
445
  hereupon
446
446
  hers
447
447
  herself
448
- herse”
449
448
  hes
449
+ he's
450
450
  hi
451
451
  hid
452
452
  high
@@ -454,7 +454,6 @@ higher
454
454
  highest
455
455
  him
456
456
  himself
457
- himse”
458
457
  his
459
458
  hither
460
459
  hk
@@ -464,11 +463,11 @@ home
464
463
  homepage
465
464
  hopefully
466
465
  how
466
+ howbeit
467
467
  how'd
468
+ however
468
469
  how'll
469
470
  how's
470
- howbeit
471
- however
472
471
  hr
473
472
  ht
474
473
  htm
@@ -477,19 +476,18 @@ http
477
476
  hu
478
477
  hundred
479
478
  i
480
- i'd
481
- i'll
482
- i'm
483
- i've
484
479
  i.e.
485
480
  id
481
+ i'd
486
482
  ie
487
483
  if
488
484
  ignored
489
485
  ii
490
486
  il
491
487
  ill
488
+ i'll
492
489
  im
490
+ i'm
493
491
  immediate
494
492
  immediately
495
493
  importance
@@ -521,18 +519,18 @@ iq
521
519
  ir
522
520
  is
523
521
  isn
524
- isn't
525
522
  isnt
523
+ isn't
526
524
  it
527
- it'd
528
- it'll
529
- it's
530
525
  itd
526
+ it'd
531
527
  itll
528
+ it'll
532
529
  its
530
+ it's
533
531
  itself
534
- itse”
535
532
  ive
533
+ i've
536
534
  j
537
535
  je
538
536
  jm
@@ -578,8 +576,8 @@ length
578
576
  less
579
577
  lest
580
578
  let
581
- let's
582
579
  lets
580
+ let's
583
581
  li
584
582
  like
585
583
  liked
@@ -589,6 +587,7 @@ line
589
587
  little
590
588
  lk
591
589
  ll
590
+ ll
592
591
  long
593
592
  longer
594
593
  longest
@@ -605,6 +604,7 @@ lu
605
604
  lv
606
605
  ly
607
606
  m
607
+ m
608
608
  ma
609
609
  made
610
610
  mainly
@@ -615,8 +615,8 @@ man
615
615
  many
616
616
  may
617
617
  maybe
618
- mayn't
619
618
  maynt
619
+ mayn't
620
620
  mc
621
621
  md
622
622
  me
@@ -632,9 +632,9 @@ mg
632
632
  mh
633
633
  microsoft
634
634
  might
635
- might've
636
- mightn't
637
635
  mightnt
636
+ mightn't
637
+ might've
638
638
  mil
639
639
  mill
640
640
  million
@@ -662,17 +662,17 @@ mu
662
662
  much
663
663
  mug
664
664
  must
665
- must've
666
- mustn't
667
665
  mustnt
666
+ mustn't
667
+ must've
668
668
  mv
669
669
  mw
670
670
  mx
671
671
  my
672
672
  myself
673
- myse”
674
673
  mz
675
674
  n
675
+ n't
676
676
  na
677
677
  name
678
678
  namely
@@ -687,8 +687,8 @@ necessary
687
687
  need
688
688
  needed
689
689
  needing
690
- needn't
691
690
  neednt
691
+ needn't
692
692
  needs
693
693
  neither
694
694
  net
@@ -708,12 +708,12 @@ nine
708
708
  ninety
709
709
  nl
710
710
  no
711
- no-one
712
711
  nobody
713
712
  non
714
713
  none
715
714
  nonetheless
716
715
  noone
716
+ no-one
717
717
  nor
718
718
  normally
719
719
  nos
@@ -726,6 +726,7 @@ now
726
726
  nowhere
727
727
  np
728
728
  nr
729
+ n't
729
730
  nu
730
731
  null
731
732
  number
@@ -749,8 +750,8 @@ omitted
749
750
  on
750
751
  once
751
752
  one
752
- one's
753
753
  ones
754
+ one's
754
755
  only
755
756
  onto
756
757
  open
@@ -769,8 +770,8 @@ other
769
770
  others
770
771
  otherwise
771
772
  ought
772
- oughtn't
773
773
  oughtnt
774
+ oughtn't
774
775
  our
775
776
  ours
776
777
  ourselves
@@ -848,6 +849,7 @@ ran
848
849
  rather
849
850
  rd
850
851
  re
852
+ re
851
853
  readily
852
854
  really
853
855
  reasonably
@@ -876,6 +878,7 @@ ru
876
878
  run
877
879
  rw
878
880
  s
881
+ s
879
882
  sa
880
883
  said
881
884
  same
@@ -912,20 +915,20 @@ several
912
915
  sg
913
916
  sh
914
917
  shall
915
- shan't
916
918
  shant
919
+ shan't
917
920
  she
918
- she'd
919
- she'll
920
- she's
921
921
  shed
922
+ she'd
922
923
  shell
924
+ she'll
923
925
  shes
926
+ she's
924
927
  should
925
- should've
926
928
  shouldn
927
- shouldn't
928
929
  shouldnt
930
+ shouldn't
931
+ should've
929
932
  show
930
933
  showed
931
934
  showing
@@ -992,7 +995,6 @@ sy
992
995
  system
993
996
  sz
994
997
  t
995
- t's
996
998
  take
997
999
  taken
998
1000
  taking
@@ -1011,12 +1013,12 @@ thank
1011
1013
  thanks
1012
1014
  thanx
1013
1015
  that
1014
- that'll
1015
- that's
1016
- that've
1017
1016
  thatll
1017
+ that'll
1018
1018
  thats
1019
+ that's
1019
1020
  thatve
1021
+ that've
1020
1022
  the
1021
1023
  their
1022
1024
  theirs
@@ -1025,33 +1027,33 @@ themselves
1025
1027
  then
1026
1028
  thence
1027
1029
  there
1028
- there'd
1029
- there'll
1030
- there're
1031
- there's
1032
- there've
1033
1030
  thereafter
1034
1031
  thereby
1035
1032
  thered
1033
+ there'd
1036
1034
  therefore
1037
1035
  therein
1038
1036
  therell
1037
+ there'll
1039
1038
  thereof
1040
1039
  therere
1040
+ there're
1041
1041
  theres
1042
+ there's
1042
1043
  thereto
1043
1044
  thereupon
1044
1045
  thereve
1046
+ there've
1045
1047
  these
1046
1048
  they
1047
- they'd
1048
- they'll
1049
- they're
1050
- they've
1051
1049
  theyd
1050
+ they'd
1052
1051
  theyll
1052
+ they'll
1053
1053
  theyre
1054
+ they're
1054
1055
  theyve
1056
+ they've
1055
1057
  thick
1056
1058
  thin
1057
1059
  thing
@@ -1080,6 +1082,7 @@ til
1080
1082
  till
1081
1083
  tip
1082
1084
  tis
1085
+ tis
1083
1086
  tj
1084
1087
  tk
1085
1088
  tm
@@ -1101,6 +1104,7 @@ truly
1101
1104
  try
1102
1105
  trying
1103
1106
  ts
1107
+ t's
1104
1108
  tt
1105
1109
  turn
1106
1110
  turned
@@ -1109,6 +1113,7 @@ turns
1109
1113
  tv
1110
1114
  tw
1111
1115
  twas
1116
+ twas
1112
1117
  twelve
1113
1118
  twenty
1114
1119
  twice
@@ -1151,6 +1156,7 @@ value
1151
1156
  various
1152
1157
  vc
1153
1158
  ve
1159
+ ve
1154
1160
  versus
1155
1161
  very
1156
1162
  vg
@@ -1169,53 +1175,53 @@ wanting
1169
1175
  wants
1170
1176
  was
1171
1177
  wasn
1172
- wasn't
1173
1178
  wasnt
1179
+ wasn't
1174
1180
  way
1175
1181
  ways
1176
1182
  we
1177
- we'd
1178
- we'll
1179
- we're
1180
- we've
1181
1183
  web
1182
1184
  webpage
1183
1185
  website
1184
1186
  wed
1187
+ we'd
1185
1188
  welcome
1186
1189
  well
1190
+ we'll
1187
1191
  wells
1188
1192
  went
1189
1193
  were
1194
+ we're
1190
1195
  weren
1191
- weren't
1192
1196
  werent
1197
+ weren't
1193
1198
  weve
1199
+ we've
1194
1200
  wf
1195
1201
  what
1196
1202
  what'd
1197
- what'll
1198
- what's
1199
- what've
1200
1203
  whatever
1201
1204
  whatll
1205
+ what'll
1202
1206
  whats
1207
+ what's
1203
1208
  whatve
1209
+ what've
1204
1210
  when
1211
+ whence
1205
1212
  when'd
1213
+ whenever
1206
1214
  when'll
1207
1215
  when's
1208
- whence
1209
- whenever
1210
1216
  where
1211
- where'd
1212
- where'll
1213
- where's
1214
1217
  whereafter
1215
1218
  whereas
1216
1219
  whereby
1220
+ where'd
1217
1221
  wherein
1222
+ where'll
1218
1223
  wheres
1224
+ where's
1219
1225
  whereupon
1220
1226
  wherever
1221
1227
  whether
@@ -1226,16 +1232,16 @@ whilst
1226
1232
  whim
1227
1233
  whither
1228
1234
  who
1229
- who'd
1230
- who'll
1231
- who's
1232
1235
  whod
1236
+ who'd
1233
1237
  whoever
1234
1238
  whole
1235
1239
  wholl
1240
+ who'll
1236
1241
  whom
1237
1242
  whomever
1238
1243
  whos
1244
+ who's
1239
1245
  whose
1240
1246
  why
1241
1247
  why'd
@@ -1250,9 +1256,9 @@ with
1250
1256
  within
1251
1257
  without
1252
1258
  won
1253
- won't
1254
1259
  wonder
1255
1260
  wont
1261
+ won't
1256
1262
  words
1257
1263
  work
1258
1264
  worked
@@ -1260,10 +1266,10 @@ working
1260
1266
  works
1261
1267
  world
1262
1268
  would
1263
- would've
1264
1269
  wouldn
1265
- wouldn't
1266
1270
  wouldnt
1271
+ wouldn't
1272
+ would've
1267
1273
  ws
1268
1274
  www
1269
1275
  x
@@ -1274,25 +1280,25 @@ years
1274
1280
  yes
1275
1281
  yet
1276
1282
  you
1277
- you'd
1278
- you'll
1279
- you're
1280
- you've
1281
1283
  youd
1284
+ you'd
1282
1285
  youll
1286
+ you'll
1283
1287
  young
1284
1288
  younger
1285
1289
  youngest
1286
1290
  your
1287
1291
  youre
1292
+ you're
1288
1293
  yours
1289
1294
  yourself
1290
1295
  yourselves
1291
1296
  youve
1297
+ you've
1292
1298
  yt
1293
1299
  yu
1294
1300
  z
1295
1301
  za
1296
1302
  zero
1297
1303
  zm
1298
- zr
1304
+ zr
PgsFile/PgsFile.py CHANGED
@@ -3795,4 +3795,60 @@ def perform_liwc_zh(dic_path, file_path, output_excel_path):
3795
3795
 
3796
3796
  import pandas as pd
3797
3797
  df = pd.DataFrame(data,columns=[u'类别', u'出现词种数', u'占词表百分比', u'出现词次', u'总词次', u'覆盖率', u'例词'])
3798
- df.to_excel(output_excel_path,'sheet1',index=False)
3798
+ df.to_excel(output_excel_path,'sheet1',index=False)
3799
+
3800
+
3801
+ import math
3802
+ from collections import defaultdict
3803
+ def calculate_log_likelihood(target_count, reference_count, total_target, total_reference):
3804
+ """Calculate the log-likelihood of a word being a keyword using absolute frequencies."""
3805
+ # Calculate expected frequencies
3806
+ total_combined = total_target + total_reference
3807
+ expected_target = (target_count + reference_count) * (total_target / total_combined)
3808
+ expected_reference = (target_count + reference_count) * (total_reference / total_combined)
3809
+
3810
+ # Calculate log-likelihood
3811
+ ll = 0.0
3812
+ if target_count > 0:
3813
+ ll += target_count * math.log(target_count / expected_target)
3814
+ if reference_count > 0:
3815
+ ll += reference_count * math.log(reference_count / expected_reference)
3816
+
3817
+ return ll * 2 # Return G^2 statistic
3818
+
3819
+ def extract_keywords_en(target_text, top_n=10):
3820
+ """Extract keywords from target text using log-likelihood with absolute reference frequencies."""
3821
+ # Example usage
3822
+ my_dic_path = get_library_location("PgsFile")+"/PgsFile/models/dics/unigram_freq_only.json"
3823
+ reference_freq = get_data_json(my_dic_path)
3824
+ # Tokenize target text and preserve original case
3825
+ original_words = word_tokenize2(target_text)
3826
+ lower_words = [w.lower() for w in original_words if w.lower() not in BigPunctuation and w.lower() not in get_stopwords()]
3827
+ total_target = len(lower_words)
3828
+
3829
+ # Calculate target word frequencies
3830
+ target_word_freq = defaultdict(int)
3831
+ word_case_mapping = {}
3832
+ for orig_word, lower_word in zip(original_words, [w.lower() for w in original_words]):
3833
+ if lower_word in lower_words:
3834
+ target_word_freq[lower_word] += 1
3835
+ if lower_word not in word_case_mapping:
3836
+ word_case_mapping[lower_word] = orig_word
3837
+
3838
+ # Calculate total reference frequency
3839
+ total_reference = sum(reference_freq.values())
3840
+
3841
+ # Calculate log-likelihood for each word
3842
+ keyword_scores = []
3843
+ for word, target_count in target_word_freq.items():
3844
+ reference_count = reference_freq.get(word, 0)
3845
+ ll = calculate_log_likelihood(target_count, reference_count, total_target, total_reference)
3846
+ relative_freq = target_count / total_target
3847
+ original_word = word_case_mapping.get(word, word)
3848
+ keyword_scores.append((original_word, target_count, relative_freq, ll))
3849
+
3850
+ # Sort keywords by log-likelihood score
3851
+ keyword_scores.sort(key=lambda x: x[3], reverse=True)
3852
+
3853
+ # Return top N keywords
3854
+ return keyword_scores[:top_n]
PgsFile/__init__.py CHANGED
@@ -49,7 +49,7 @@ from .PgsFile import ngrams, bigrams, trigrams, everygrams, compute_similarity,
49
49
  from .PgsFile import word_list, batch_word_list
50
50
  from .PgsFile import cs, cs1, sent_tokenize, word_tokenize, word_tokenize2
51
51
  from .PgsFile import word_lemmatize, word_POS, word_NER
52
- from .PgsFile import extract_noun_phrases, get_LLMs_prompt
52
+ from .PgsFile import extract_noun_phrases, get_LLMs_prompt, extract_keywords_en
53
53
  from .PgsFile import extract_dependency_relations, extract_dependency_relations_full
54
54
  from .PgsFile import predict_category
55
55