demystify 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +64 -0
- data/lib/demystify/version.rb +1 -1
- data/lib/demystify.rb +35 -20
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 70032c0a2eaf6eeecedc921876cf1760e160c7fb
|
4
|
+
data.tar.gz: d855804fb3f43777a71cac79dd8a54b5bd990896
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 64e2cc110909035a29df6e05e46235a0bcc7db8faf4ea8c412a25149da1a03af1684bf887d7e995ea25afafbffacfd3ceedd6de469eb0cb7d77c8dfad3d2b607
|
7
|
+
data.tar.gz: 43d451c3267e88b6aa1bda6792310066a4d1666807fdb3bf5741eb138e299db508f23bd211a5c88a5cd7a7f3eb241e543ed6ca95ad2a87f789cc1fdb60f47192
|
data/README.md
CHANGED
@@ -1 +1,65 @@
|
|
1
1
|
# Demystify
|
2
|
+
|
3
|
+
Demystify is a gem to help you deal with text, for text analysis or NLP projects.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'demystify'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install demystify
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
Make a Text object using your text file.
|
24
|
+
```ruby
|
25
|
+
text = Demystify::Text.new('./my_text_file.txt')
|
26
|
+
```
|
27
|
+
|
28
|
+
Get an array of all characters, words or sentences:
|
29
|
+
```ruby
|
30
|
+
text.chars
|
31
|
+
text.words
|
32
|
+
text.sentences
|
33
|
+
```
|
34
|
+
|
35
|
+
Count the number of all characters, spaces, new lines, non-whitespace characters,
|
36
|
+
punctuation, symbols, letters, non-letters, words and sentences:
|
37
|
+
```ruby
|
38
|
+
text.char_count
|
39
|
+
text.spaces_count
|
40
|
+
text.new_line_count
|
41
|
+
text.non_whitespace_char_count
|
42
|
+
text.punctuation_count
|
43
|
+
text.symbol_count
|
44
|
+
text.letter_count
|
45
|
+
text.non_letter_count
|
46
|
+
text.word_count
|
47
|
+
text.sentence_count
|
48
|
+
```
|
49
|
+
|
50
|
+
Check for the number of occurrences of a particular sequence of characters:
|
51
|
+
```ruby
|
52
|
+
text.sequence_count(sequence)
|
53
|
+
```
|
54
|
+
|
55
|
+
Get the first word or last word of every sentence in an array:
|
56
|
+
```ruby
|
57
|
+
text.first_words
|
58
|
+
text.last_words
|
59
|
+
```
|
60
|
+
|
61
|
+
Get a hash that maps every word in the text to an array of all the words that follow or precede it in the text:
|
62
|
+
```ruby
|
63
|
+
text.forwards_probability_hash
|
64
|
+
text.backwards_probability_hash
|
65
|
+
```
|
data/lib/demystify/version.rb
CHANGED
data/lib/demystify.rb
CHANGED
@@ -20,15 +20,22 @@ module Demystify
|
|
20
20
|
|
21
21
|
class Text
|
22
22
|
|
23
|
-
attr_accessor :content,
|
23
|
+
attr_accessor :content,
|
24
|
+
:chars,
|
25
|
+
:words,
|
26
|
+
:sentences,
|
27
|
+
:forwards_probability_hash,
|
28
|
+
:backwards_probability_hash,
|
29
|
+
:first_words,
|
30
|
+
:last_words
|
24
31
|
|
25
32
|
def initialize(file)
|
26
33
|
@content = open(file).read
|
27
34
|
@chars = @content.split("")
|
28
35
|
@words = @content.split(/[^[[:word:]]]+/)
|
29
|
-
|
30
|
-
|
31
|
-
|
36
|
+
make_sentences
|
37
|
+
make_probability_hashes
|
38
|
+
make_first_and_last_words
|
32
39
|
end
|
33
40
|
|
34
41
|
def char_count
|
@@ -108,33 +115,41 @@ module Demystify
|
|
108
115
|
@sentences.length
|
109
116
|
end
|
110
117
|
|
111
|
-
|
112
|
-
first_words = []
|
113
|
-
@sentences.each do |sentence|
|
114
|
-
first_words << sentence.first
|
115
|
-
end
|
116
|
-
first_words
|
117
|
-
end
|
118
|
+
private
|
118
119
|
|
119
|
-
def
|
120
|
-
|
120
|
+
def make_first_and_last_words
|
121
|
+
@first_words = []
|
122
|
+
@last_words = []
|
121
123
|
@sentences.each do |sentence|
|
122
|
-
|
124
|
+
split_sentence = sentence.split(" ")
|
125
|
+
@first_words << split_sentence.first
|
126
|
+
@last_words << split_sentence.last
|
123
127
|
end
|
124
|
-
last_words
|
125
128
|
end
|
126
129
|
|
127
|
-
private
|
128
|
-
|
129
130
|
def make_sentences
|
130
131
|
sentence_regex = /((?<=[a-z0-9)][.?!])|(?<=[a-z0-9][.?!]"))\s+(?="?[A-Z])/
|
131
132
|
sentences = @content.split(sentence_regex)
|
132
133
|
sentences.select!{|sentence| sentence.length > 1}
|
133
|
-
sentences.map{|sentence| sentence.chomp}
|
134
|
+
@sentences = sentences.map{|sentence| sentence.chomp}
|
135
|
+
end
|
136
|
+
|
137
|
+
def make_probability_hashes
|
138
|
+
@forwards_probability_hash = Hash.new { |h, k| h[k] = [] }
|
139
|
+
@backwards_probability_hash = Hash.new { |h, k| h[k] = [] }
|
140
|
+
@sentences.each do |sentence|
|
141
|
+
sentence_array = sentence.split(" ")
|
142
|
+
sentence_array.each_with_index do |word, i|
|
143
|
+
unless i == sentence_array.length - 1
|
144
|
+
@forwards_probability_hash[word] << sentence_array[i+1]
|
145
|
+
end
|
146
|
+
unless i == 0
|
147
|
+
@backwards_probability_hash[word] << sentence_array[i-1]
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
134
151
|
end
|
135
152
|
|
136
153
|
end
|
137
154
|
|
138
155
|
end
|
139
|
-
|
140
|
-
something = Demystify::Text.new(File.join( File.dirname(__FILE__), '../sample1.txt'))
|