lingua-it-readability 1.0.5 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +2 -2
- data/Rakefile +0 -2
- data/lib/lingua/it/readability/version.rb +1 -1
- data/lib/lingua/it/readability.rb +16 -5
- data/lib/lingua/it/sentence.rb +33 -9
- data/lib/lingua/it/syllable.rb +15 -15
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 470c3c7ea7eb28b6f52bc516d9f4a161d287c1e8
|
|
4
|
+
data.tar.gz: d5e4a825918c07682500ab93d3e64be02ea8fb53
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9b4a7e504931bd2dac6d06dad98e41dcd992a0135b55249b099ff5d582933a652be4ef8e9c190678606f1f1a83c6186a083b3eaf86d86be25bce3bf52b5086dd
|
|
7
|
+
data.tar.gz: 2feae4565d24deb075bac0ee79e11c9c6652a4940a8e408ad1c4896114b8370fe31ce4f5901a161fdf833d3900cd3e819a4329caf8f233119b5d2ed64255ef5c
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
#### 1.1.0 - 2016-02-09
|
|
2
|
+
###### Added
|
|
3
|
+
- Optimized regex, greatly improved overall performance
|
|
4
|
+
- Possibility to add custom symbols as end delimiter for sentences
|
|
5
|
+
- Readme usage section
|
|
6
|
+
###### Fixed
|
|
7
|
+
- Minor bugs
|
|
8
|
+
|
|
1
9
|
#### 1.0.4 - 2016-02-09
|
|
2
10
|
###### Added
|
|
3
11
|
- Readme usage section
|
data/README.md
CHANGED
|
@@ -32,9 +32,9 @@ report.gulpease # 59
|
|
|
32
32
|
report.flesch # 36.92
|
|
33
33
|
report.report # a formatted summary of statistics and measures
|
|
34
34
|
|
|
35
|
-
# accept
|
|
35
|
+
# accept custom symbols as delimiters
|
|
36
36
|
text = "Lista:\n- Gennaio;\n- Febbraio;"
|
|
37
|
-
report = Lingua::IT::Readability.new(text)
|
|
37
|
+
report = Lingua::IT::Readability.new(text, ':', '-')
|
|
38
38
|
report.num_sentences # 3
|
|
39
39
|
report.num_words # 3
|
|
40
40
|
report.num_syllables # 8
|
data/lib/lingua/it/readability.rb
CHANGED
|
@@ -10,19 +10,25 @@ module Lingua
|
|
|
10
10
|
module IT
|
|
11
11
|
class Readability
|
|
12
12
|
attr_reader :text
|
|
13
|
-
attr_reader :
|
|
14
|
-
attr_reader :type
|
|
13
|
+
attr_reader :sent
|
|
15
14
|
attr_reader :paragraphs
|
|
16
15
|
attr_reader :sentences
|
|
17
16
|
attr_reader :words
|
|
18
17
|
attr_reader :frequencies
|
|
19
18
|
|
|
20
19
|
# Initialize the sample with +text+
|
|
21
|
-
def initialize(text = '',
|
|
20
|
+
def initialize(text = '', *delimiters)
|
|
21
|
+
|
|
22
|
+
@sent = Lingua::IT::Sentence
|
|
23
|
+
if(!delimiters.empty?)
|
|
24
|
+
@sent.delimiter(delimiters)
|
|
25
|
+
else
|
|
26
|
+
@sent.reset_delimiter
|
|
27
|
+
end
|
|
28
|
+
|
|
22
29
|
@text = text.dup
|
|
23
|
-
@type = type
|
|
24
30
|
@paragraphs = Lingua::IT::Paragraph.paragraphs(self.text)
|
|
25
|
-
@sentences =
|
|
31
|
+
@sentences = @sent.sentences(self.text)
|
|
26
32
|
@words = []
|
|
27
33
|
@frequencies = {}
|
|
28
34
|
@frequencies.default = 0
|
|
@@ -30,6 +36,11 @@ module Lingua
|
|
|
30
36
|
count_words
|
|
31
37
|
end
|
|
32
38
|
|
|
39
|
+
# Reset Lingua::IT::Sentence symbols delimiter cache
|
|
40
|
+
def reset_delimiter
|
|
41
|
+
@sent.reset_delimiter
|
|
42
|
+
end
|
|
43
|
+
|
|
33
44
|
# The number of paragraphs in the sample. A paragraph is defined as a
|
|
34
45
|
# newline followed by one or more empty or whitespace-only lines.
|
|
35
46
|
def num_paragraphs
|
data/lib/lingua/it/sentence.rb
CHANGED
|
@@ -8,6 +8,8 @@ module Lingua
|
|
|
8
8
|
class << self
|
|
9
9
|
attr_reader :abbreviations
|
|
10
10
|
attr_reader :abbr_regex
|
|
11
|
+
attr_reader :delimiters
|
|
12
|
+
attr_reader :delim_regex
|
|
11
13
|
end
|
|
12
14
|
|
|
13
15
|
# Common abbreviations
|
|
@@ -16,22 +18,18 @@ module Lingua
|
|
|
16
18
|
MONTHS = %w(gen feb mar apr mag giu lug ago set sett ott nov dic) unless defined?(MONTHS)
|
|
17
19
|
DAYS = %w(lun mar mer gio ven sab dom) unless defined?(DAYS)
|
|
18
20
|
|
|
19
|
-
#
|
|
20
|
-
|
|
21
|
-
'standard' => /["']?[A-Z][^.?!]+((?![.?!]['"]?\s["']?[A-Z][^.?!]).)+[.?!'"]+/,
|
|
22
|
-
'scientific' => /["']?[A-Z][^.;:?!]+((?![.;:?!]['"]?\s["']?[A-Z][^.;:?!]).)+[.;:?!'"]+/
|
|
23
|
-
}
|
|
24
|
-
TYPES.default_proc = proc { |hash, key| hash[key] = /["']?[A-Z][^.?!]+((?![.?!]['"]?\s["']?[A-Z][^.?!]).)+[.?!'"]+/ }
|
|
21
|
+
# Standard delimiters
|
|
22
|
+
STD = %w(. ? !)
|
|
25
23
|
|
|
26
24
|
# Split up in sentences, use 0002 as a temporary end mark for
|
|
27
25
|
# the abbreviations found, even if the regex should be enough
|
|
28
26
|
# to recognize real stop point from abbreviations ones.
|
|
29
27
|
# A sentence should definitely end marked only by a . or a ?
|
|
30
28
|
# or a !
|
|
31
|
-
def self.sentences(text
|
|
29
|
+
def self.sentences(text)
|
|
32
30
|
txt = text.dup
|
|
33
31
|
txt.gsub!(/\b(#{@abbr_regex})(\.)\B/i, '\10002')
|
|
34
|
-
txt.gsub!(
|
|
32
|
+
txt.gsub!(/["']?[A-Z][^#{@delim_regex}]+((?![#{@delim_regex}]['"]?\s["']?[A-Z][^#{@delim_regex}]).)+[#{@delim_regex}'"]+/, '\2\001')
|
|
35
33
|
txt.gsub!(/\b(#{@abbr_regex})(0002)/i, '\1.')
|
|
36
34
|
txt.split(/01/).map { |sentence| sentence.strip }
|
|
37
35
|
end
|
|
@@ -44,6 +42,20 @@ module Lingua
|
|
|
44
42
|
@abbreviations
|
|
45
43
|
end
|
|
46
44
|
|
|
45
|
+
# Add symbols to sentence delimiters
|
|
46
|
+
def self.delimiter(*delimiters)
|
|
47
|
+
@delimiters += delimiters
|
|
48
|
+
@delimiters.uniq!
|
|
49
|
+
set_delim_regex!
|
|
50
|
+
@delimiters
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def self.reset_delimiter
|
|
54
|
+
@delimiters = STD
|
|
55
|
+
set_delim_regex!
|
|
56
|
+
@delimiters
|
|
57
|
+
end
|
|
58
|
+
|
|
47
59
|
private
|
|
48
60
|
# Utility method, chain up all abbreviations constants arrays
|
|
49
61
|
def self.initialize_abbreviations!
|
|
@@ -57,8 +69,20 @@ module Lingua
|
|
|
57
69
|
@abbr_regex = "#{@abbreviations.join('|')}"
|
|
58
70
|
end
|
|
59
71
|
|
|
60
|
-
|
|
72
|
+
# Utility method, chain up all delimiters constants arrays
|
|
73
|
+
def self.initialize_delimiters!
|
|
74
|
+
@delimiters = STD
|
|
75
|
+
set_delim_regex!
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
# Utility method, join all elements of the delimiters arrays
|
|
79
|
+
# without a separator, making suitable for a regex.
|
|
80
|
+
def self.set_delim_regex!
|
|
81
|
+
@delim_regex = "#{@delimiters.join('\\')}"
|
|
82
|
+
end
|
|
61
83
|
|
|
84
|
+
initialize_abbreviations!
|
|
85
|
+
initialize_delimiters!
|
|
62
86
|
end
|
|
63
87
|
end
|
|
64
88
|
end
|
data/lib/lingua/it/syllable.rb
CHANGED
|
@@ -24,19 +24,19 @@ module Lingua
|
|
|
24
24
|
words = text.dup.split(/[^a-zA-Zàèéìòù'0-9]+/)
|
|
25
25
|
hyphenation = ""
|
|
26
26
|
words.each do |word|
|
|
27
|
-
word.gsub!(/(#{V})(#{S})/
|
|
28
|
-
word.gsub!(/(#{V})(#{Z})/
|
|
29
|
-
word.gsub!(/(#{X})(#{V})/
|
|
30
|
-
word.gsub!(/(#{C})(#{V})(#{V})(#{Y})
|
|
31
|
-
word.gsub!(/(#{V})([bcfgptv][lr])/
|
|
32
|
-
word.gsub!(/(#{V})([cg]h)/
|
|
33
|
-
word.gsub!(/(#{V})(gn)/
|
|
34
|
-
word.gsub!(/(#{C})\1/
|
|
35
|
-
word.gsub!(/(s#{C})/
|
|
36
|
-
1 while word.gsub!(/(#{V}*#{C}+#{V}+)(#{C}#{V})/
|
|
37
|
-
1 while word.gsub!(/(#{V}*#{C}+#{V}+#{C})(#{C})/
|
|
38
|
-
word.gsub!(/^(#{V}+#{C})(#{C})/
|
|
39
|
-
word.gsub!(/^(#{V}+)(#{C}#{V})/
|
|
27
|
+
word.gsub!(/(#{V})(#{S})/io, '\1=iu=t')
|
|
28
|
+
word.gsub!(/(#{V})(#{Z})/io, '\1=\2')
|
|
29
|
+
word.gsub!(/(#{X})(#{V})/io, '\1=\2')
|
|
30
|
+
word.gsub!(/(#{C})(#{V})(#{V})(#{Y})/io, '\1\2=\3=\4')
|
|
31
|
+
word.gsub!(/(#{V})([bcfgptv][lr])/io, '\1=\2')
|
|
32
|
+
word.gsub!(/(#{V})([cg]h)/io, '\1=\2')
|
|
33
|
+
word.gsub!(/(#{V})(gn)/io, '\1=\2')
|
|
34
|
+
word.gsub!(/(#{C})\1/io, '\1=\1')
|
|
35
|
+
word.gsub!(/(s#{C})/io, '=\1')
|
|
36
|
+
1 while word.gsub!(/(#{V}*#{C}+#{V}+)(#{C}#{V})/io, '\1=\2')
|
|
37
|
+
1 while word.gsub!(/(#{V}*#{C}+#{V}+#{C})(#{C})/io, '\1=\2')
|
|
38
|
+
word.gsub!(/^(#{V}+#{C})(#{C})/io, '\1=\2')
|
|
39
|
+
word.gsub!(/^(#{V}+)(#{C}#{V})/io, '\1=\2')
|
|
40
40
|
word.sub!(/^=/, '')
|
|
41
41
|
word.sub!(/=$/, '')
|
|
42
42
|
word.gsub!(/=+/,'=');
|
|
@@ -44,8 +44,8 @@ module Lingua
|
|
|
44
44
|
word.gsub!(/(le)([oa]n)/i, '\1=\2')
|
|
45
45
|
word.gsub!(/(le)([oa])(an)/i, '\1=\2=\3')
|
|
46
46
|
word.gsub!(/(spe)=(le)=(o)/i, '\1=\2\3')
|
|
47
|
-
word.gsub!(/([gd]i)=(#{V})/
|
|
48
|
-
word.gsub!(/(ni)=(#{V})/
|
|
47
|
+
word.gsub!(/([gd]i)=(#{V})/io, '\1\2')
|
|
48
|
+
word.gsub!(/(ni)=(#{V})/io, '\1\2')
|
|
49
49
|
word.gsub!(/=(e)=(l)/i, '\1\2')
|
|
50
50
|
hyphenation += "#{word}="
|
|
51
51
|
end
|