textclass-en 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: textclass-en
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract keywords from English text
|
|
5
|
+
Description-Content-Type: text/markdown
|
|
6
|
+
|
|
7
|
+
# textclass-en
|
|
8
|
+
A simple, fast Python library to extract keywords from English text.
|
|
9
|
+
|
|
10
|
+
## Quick Start
|
|
11
|
+
```python
|
|
12
|
+
import textclass_en
|
|
13
|
+
|
|
14
|
+
textclass_en.textclass("The quick brown fox jumped over the lazy dog") # result: ['quick', 'fox', 'jumped']
|
|
15
|
+
textclass_en.textclass("The quick brown fox jumped over the lazy dog", max_keywords=10) # result: ['quick', 'fox', 'jumped', 'lazy', 'dog']
|
|
16
|
+
textclass_en.textclass("The company 'OpenBrain' is about to go bankrupt because of their newest AI model.") # result: ['company', 'openbrain', 'bankrupt']
|
|
17
|
+
textclass_en.textclass("The company 'OpenBrain' is about to go bankrupt because of their newest AI model.", max_keywords=10) # result: ['company', 'openbrain', 'bankrupt', 'newest', 'ai', 'model']
|
|
18
|
+
```
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# textclass-en
|
|
2
|
+
A simple, fast Python library to extract keywords from English text.
|
|
3
|
+
|
|
4
|
+
## Quick Start
|
|
5
|
+
```python
|
|
6
|
+
import textclass_en
|
|
7
|
+
|
|
8
|
+
textclass_en.textclass("The quick brown fox jumped over the lazy dog") # result: ['quick', 'fox', 'jumped']
|
|
9
|
+
textclass_en.textclass("The quick brown fox jumped over the lazy dog", max_keywords=10) # result: ['quick', 'fox', 'jumped', 'lazy', 'dog']
|
|
10
|
+
textclass_en.textclass("The company 'OpenBrain' is about to go bankrupt because of their newest AI model.") # result: ['company', 'openbrain', 'bankrupt']
|
|
11
|
+
textclass_en.textclass("The company 'OpenBrain' is about to go bankrupt because of their newest AI model.", max_keywords=10) # result: ['company', 'openbrain', 'bankrupt', 'newest', 'ai', 'model']
|
|
12
|
+
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import textclass
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# English stop words that are never reported as keywords.  The entries are
# kept in their original order and spelling; they are grouped by initial
# letter here purely for readability.  The final group holds colour names.
stopwords = (
    # a
    "a", "about", "above", "across", "after", "afterwards", "again",
    "against", "all", "almost", "alone", "along", "already", "also",
    "although", "always", "am", "among", "amongst", "amoungst", "amount",
    "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway",
    "anywhere", "are", "around", "as", "at",
    # b
    "back", "be", "became", "because", "become", "becomes", "becoming",
    "been", "before", "beforehand", "behind", "being", "below", "beside",
    "besides", "between", "beyond", "bill", "both", "bottom", "but", "by",
    # c
    "call", "can", "cannot", "cant", "co", "computer", "con", "could",
    "couldnt", "cry",
    # d
    "de", "describe", "detail", "do", "done", "down", "due", "during",
    # e
    "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty",
    "enough", "etc", "even", "ever", "every", "everyone", "everything",
    "everywhere", "except",
    # f
    "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for",
    "former", "formerly", "forty", "found", "four", "from", "front", "full",
    "further",
    # g
    "get", "give", "go",
    # h
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself",
    "his", "how", "however", "hundred",
    # i
    "i", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it",
    "its", "itself",
    # k
    "keep",
    # l
    "last", "latter", "latterly", "least", "less", "ltd",
    # m
    "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more",
    "moreover", "most", "mostly", "move", "much", "must", "my", "myself",
    # n
    "name", "namely", "neither", "never", "nevertheless", "next", "nine",
    "no", "nobody", "none", "noone", "nor", "not", "nothing", "now",
    "nowhere",
    # o
    "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other",
    "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own",
    # p
    "part", "per", "perhaps", "please", "put",
    # r
    "rather", "re",
    # s
    "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several",
    "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so",
    "some", "somehow", "someone", "something", "sometime", "sometimes",
    "somewhere", "still", "such", "system",
    # t
    "take", "ten", "than", "that", "the", "their", "them", "themselves",
    "then", "thence", "there", "thereafter", "thereby", "therefore",
    "therein", "thereupon", "these", "they", "thick", "thin", "third", "this",
    "those", "though", "three", "through", "throughout", "thru", "thus", "to",
    "together", "too", "top", "toward", "towards", "twelve", "twenty", "two",
    # u
    "un", "under", "until", "up", "upon", "us",
    # v
    "very", "via",
    # w
    "was", "we", "well", "were", "what", "whatever", "when", "whence",
    "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
    "whereupon", "wherever", "whether", "which", "while", "whither", "who",
    "whoever", "whole", "whom", "whose", "why", "will", "with", "within",
    "without", "would",
    # y
    "yet", "you", "your", "yours", "yourself", "yourselves",
    # colour names
    "red", "green", "blue", "magenta", "cyan", "teal", "brown", "orange",
    "white", "black", "pink",
)
|
|
35
|
+
|
|
36
|
+
def get_distance(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    The argument order does not affect the result.

    Only two rows of the dynamic-programming table are held at a time, and
    the shorter input is used for the row dimension.
    """
    # Put the longer input on the outer loop so each row is as short as possible.
    longer, shorter = (s2, s1) if len(s1) < len(s2) else (s1, s2)

    # Turning anything into an empty sequence costs one deletion per element.
    if not shorter:
        return len(longer)

    # prev_costs[k] is the distance between the processed prefix of
    # ``longer`` and the first k elements of ``shorter``.
    prev_costs = list(range(len(shorter) + 1))
    for row_number, item_long in enumerate(longer, start=1):
        costs = [row_number]
        for col, item_short in enumerate(shorter):
            via_insert = prev_costs[col + 1] + 1
            via_delete = costs[col] + 1
            via_substitute = prev_costs[col] + (0 if item_long == item_short else 1)
            costs.append(min(via_insert, via_delete, via_substitute))
        prev_costs = costs

    return prev_costs[-1]
|
|
51
|
+
|
|
52
|
+
def textclass(st, max_keywords=3):
    """Return the most frequent keywords found in ``st``.

    The text is lower-cased, every character that is neither a letter nor
    whitespace is dropped, and the remainder is split into words.  Words
    listed in ``stopwords`` are discarded.  Each remaining word is merged
    into the first earlier keyword whose edit distance (``get_distance``)
    is at most 1, so near-identical spellings are counted together.

    Args:
        st: The text to analyse.
        max_keywords: Upper bound on the number of keywords returned.

    Returns:
        A list of at most ``max_keywords`` keywords, most frequent first.
        Keywords with equal counts keep their order of first appearance.
    """
    # Lower-case and keep only letters and whitespace.
    cleaned = "".join(ch for ch in st.lower() if ch.isalpha() or ch.isspace())

    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings.  split(" ") produced "" tokens
    # for consecutive spaces -- e.g. where stand-alone punctuation had been
    # stripped -- and those empty tokens were then reported as keywords.
    words = [word for word in cleaned.split() if word not in stopwords]

    # freq maps each canonical keyword to its count.  Dicts preserve insertion
    # order, so iterating freq visits keywords in order of first appearance.
    # Comparing against these distinct keywords picks the same first match as
    # scanning every earlier token, without re-checking duplicates.
    freq = {}
    for word in words:
        canonical = next(
            (seen for seen in freq if get_distance(word, seen) <= 1),
            word,
        )
        freq[canonical] = freq.get(canonical, 0) + 1

    # sorted() is stable, including with reverse=True, so ties keep
    # first-appearance order.
    return sorted(freq, key=freq.get, reverse=True)[:max_keywords]
|