wcc-text-analysis 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/lib/wcc/text_analysis.rb +15 -8
- data/lib/wcc/text_analysis/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66a18a3f32298d0124f6e045f4e83db791fd2d67502c327be659fed2ab82ffad
|
4
|
+
data.tar.gz: fee8ce474ebaa139fc0ea4fb8fa706cdb5646fe2b9b3eda768ed8ef23994d137
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e69ce56fe1ffec9d057e9694ff0b45cda625e894b42abca76c94c8b14992c94dbdcca4045be5452c50f496877cd2842ee9489027fde95a55d7a418057f488e63
|
7
|
+
data.tar.gz: 88730022fd77784a9ae04c34ff7ddc32dff8c75414065ed53e078e1471a2af980f495b5587c62753252d156ad1da12cb5f1569d4e073e798300673083ef781a2
|
data/.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
Gemfile.lock
|
1
|
+
Gemfile.lock
|
2
|
+
pkg/*
|
data/lib/wcc/text_analysis.rb
CHANGED
@@ -8,11 +8,22 @@ module WCC
|
|
8
8
|
|
9
9
|
attr_reader :normalized, :stripped
|
10
10
|
|
11
|
-
def self.extract_terms(
|
12
|
-
File.read(
|
11
|
+
def self.extract_terms(db_file)
|
12
|
+
File.read(
|
13
|
+
File.join(File.dirname(__FILE__), '../../db', db_file),
|
14
|
+
).split("\n")
|
13
15
|
end
|
14
16
|
|
15
|
-
|
17
|
+
# Stopwords from http://www.ranks.nl/stopwords
|
18
|
+
def self.default_stopwords
|
19
|
+
@default_stopwords ||= extract_terms('stop_words.txt')
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.default_exclusions
|
23
|
+
@default_exclusions ||= extract_terms('transcript_exclusions.txt')
|
24
|
+
end
|
25
|
+
|
26
|
+
def initialize(string, stop_words: self.class.default_stopwords)
|
16
27
|
@original = string
|
17
28
|
@stop_words = stop_words
|
18
29
|
end
|
@@ -61,11 +72,7 @@ module WCC
|
|
61
72
|
end
|
62
73
|
|
63
74
|
def remove_ignored_tokens(string)
|
64
|
-
string - (@stop_words + EXCLUSIONS)
|
75
|
+
string - (@stop_words + self.class.default_exclusions)
|
65
76
|
end
|
66
|
-
|
67
|
-
# Stopwords from http://www.ranks.nl/stopwords
|
68
|
-
STOPWORDS = extract_terms("stop_words").freeze
|
69
|
-
EXCLUSIONS = extract_terms("transcript_exclusions").freeze
|
70
77
|
end
|
71
78
|
end
|