lingua-it-readability 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5cf375da6bf5b5319b371b9296e70eaecdc8e08a
4
- data.tar.gz: 70271a6abd3616ce4b5f90eca6295a3229b129bf
3
+ metadata.gz: 470c3c7ea7eb28b6f52bc516d9f4a161d287c1e8
4
+ data.tar.gz: d5e4a825918c07682500ab93d3e64be02ea8fb53
5
5
  SHA512:
6
- metadata.gz: 994dbba2a9793d12c1b347e3c3b225e3b6d723f350c41ad4b903b784c8714a087a8934a68cd6ca9bc983f2b17cd8b6a30ba84e62291a6f5291ffecc821e8e690
7
- data.tar.gz: 99db5e027c07ba5ea001d250c7b10102f6002530b79839f4bf6474cd5c86651f1f386988483db5d7a57c0cd143e35b9597e1502cb96bf79513b4d1448dd70bdb
6
+ metadata.gz: 9b4a7e504931bd2dac6d06dad98e41dcd992a0135b55249b099ff5d582933a652be4ef8e9c190678606f1f1a83c6186a083b3eaf86d86be25bce3bf52b5086dd
7
+ data.tar.gz: 2feae4565d24deb075bac0ee79e11c9c6652a4940a8e408ad1c4896114b8370fe31ce4f5901a161fdf833d3900cd3e819a4329caf8f233119b5d2ed64255ef5c
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ #### 1.1.0 - 2016-02-09
2
+ ###### Added
3
+ - Optimized regex, greatly improved overall performance
4
+ - Possibility to add custom symbols as end delimiters for sentences
5
+ - Readme usage section
6
+ ###### Fixed
7
+ - Minor bugs
8
+
1
9
  #### 1.0.4 - 2016-02-09
2
10
  ###### Added
3
11
  - Readme usage section
data/README.md CHANGED
@@ -32,9 +32,9 @@ report.gulpease # 59
32
32
  report.flesch # 36.92
33
33
  report.report # a formatted summary of statistics and measures
34
34
 
35
- # accept type 'scientific' to treat list items separated by semicolons as sentences
35
+ # accept custom symbols as delimiters
36
36
  text = "Lista:\n- Gennaio;\n- Febbraio;"
37
- report = Lingua::IT::Readability.new(text)
37
+ report = Lingua::IT::Readability.new(text, ':', '-')
38
38
  report.num_sentences # 3
39
39
  report.num_words # 3
40
40
  report.num_syllables # 8
data/Rakefile CHANGED
@@ -1,5 +1,3 @@
1
- require "rubygems"
2
- require "rake"
3
1
  require "bundler/gem_tasks"
4
2
  require "rspec/core/rake_task"
5
3
 
@@ -1,7 +1,7 @@
1
1
  module Lingua
2
2
  module It
3
3
  module Readability
4
- VERSION = "1.0.5"
4
+ VERSION = "1.1.0"
5
5
  end
6
6
  end
7
7
  end
@@ -10,19 +10,25 @@ module Lingua
10
10
  module IT
11
11
  class Readability
12
12
  attr_reader :text
13
- attr_reader :text
14
- attr_reader :type
13
+ attr_reader :sent
15
14
  attr_reader :paragraphs
16
15
  attr_reader :sentences
17
16
  attr_reader :words
18
17
  attr_reader :frequencies
19
18
 
20
19
  # Initialize the sample with +text+
21
- def initialize(text = '', type = 'standard')
20
+ def initialize(text = '', *delimiters)
21
+
22
+ @sent = Lingua::IT::Sentence
23
+ if(!delimiters.empty?)
24
+ @sent.delimiter(delimiters)
25
+ else
26
+ @sent.reset_delimiter
27
+ end
28
+
22
29
  @text = text.dup
23
- @type = type
24
30
  @paragraphs = Lingua::IT::Paragraph.paragraphs(self.text)
25
- @sentences = Lingua::IT::Sentence.sentences(self.text, self.type)
31
+ @sentences = @sent.sentences(self.text)
26
32
  @words = []
27
33
  @frequencies = {}
28
34
  @frequencies.default = 0
@@ -30,6 +36,11 @@ module Lingua
30
36
  count_words
31
37
  end
32
38
 
39
+ # Reset Lingua::IT::Sentence symbols delimiter cache
40
+ def reset_delimiter
41
+ @sent.reset_delimiter
42
+ end
43
+
33
44
  # The number of paragraphs in the sample. A paragraph is defined as a
34
45
  # newline followed by one or more empty or whitespace-only lines.
35
46
  def num_paragraphs
@@ -8,6 +8,8 @@ module Lingua
8
8
  class << self
9
9
  attr_reader :abbreviations
10
10
  attr_reader :abbr_regex
11
+ attr_reader :delimiters
12
+ attr_reader :delim_regex
11
13
  end
12
14
 
13
15
  # Common abbreviations
@@ -16,22 +18,18 @@ module Lingua
16
18
  MONTHS = %w(gen feb mar apr mag giu lug ago set sett ott nov dic) unless defined?(MONTHS)
17
19
  DAYS = %w(lun mar mer gio ven sab dom) unless defined?(DAYS)
18
20
 
19
- # Text types
20
- TYPES = {
21
- 'standard' => /["']?[A-Z][^.?!]+((?![.?!]['"]?\s["']?[A-Z][^.?!]).)+[.?!'"]+/,
22
- 'scientific' => /["']?[A-Z][^.;:?!]+((?![.;:?!]['"]?\s["']?[A-Z][^.;:?!]).)+[.;:?!'"]+/
23
- }
24
- TYPES.default_proc = proc { |hash, key| hash[key] = /["']?[A-Z][^.?!]+((?![.?!]['"]?\s["']?[A-Z][^.?!]).)+[.?!'"]+/ }
21
+ # Standard delimiters
22
+ STD = %w(. ? !)
25
23
 
26
24
  # Split up in sentences, use 0002 as a temporary end mark for
27
25
  # the abbreviations found, even if the regex should be enough
28
26
  # to recognize real stop point from abbreviations ones.
29
27
  # A sentence should definitely be end-marked only by a . or a ?
30
28
  # or a !
31
- def self.sentences(text, type = 'standard')
29
+ def self.sentences(text)
32
30
  txt = text.dup
33
31
  txt.gsub!(/\b(#{@abbr_regex})(\.)\B/i, '\10002')
34
- txt.gsub!(/#{TYPES[type]}/, '\2\001')
32
+ txt.gsub!(/["']?[A-Z][^#{@delim_regex}]+((?![#{@delim_regex}]['"]?\s["']?[A-Z][^#{@delim_regex}]).)+[#{@delim_regex}'"]+/, '\2\001')
35
33
  txt.gsub!(/\b(#{@abbr_regex})(0002)/i, '\1.')
36
34
  txt.split(/01/).map { |sentence| sentence.strip }
37
35
  end
@@ -44,6 +42,20 @@ module Lingua
44
42
  @abbreviations
45
43
  end
46
44
 
45
+ # Add symbols to sentence delimiters
46
+ def self.delimiter(*delimiters)
47
+ @delimiters += delimiters
48
+ @delimiters.uniq!
49
+ set_delim_regex!
50
+ @delimiters
51
+ end
52
+
53
+ def self.reset_delimiter
54
+ @delimiters = STD
55
+ set_delim_regex!
56
+ @delimiters
57
+ end
58
+
47
59
  private
48
60
  # Utility method, chain up all abbreviations constants arrays
49
61
  def self.initialize_abbreviations!
@@ -57,8 +69,20 @@ module Lingua
57
69
  @abbr_regex = "#{@abbreviations.join('|')}"
58
70
  end
59
71
 
60
- initialize_abbreviations!
72
+ # Utility method, set the delimiters to the standard ones and build their regex
73
+ def self.initialize_delimiters!
74
+ @delimiters = STD
75
+ set_delim_regex!
76
+ end
77
+
78
+ # Utility method, join all elements of the delimiters array
79
+ # with a backslash between them, making it suitable for a regex character class.
80
+ def self.set_delim_regex!
81
+ @delim_regex = "#{@delimiters.join('\\')}"
82
+ end
61
83
 
84
+ initialize_abbreviations!
85
+ initialize_delimiters!
62
86
  end
63
87
  end
64
88
  end
@@ -24,19 +24,19 @@ module Lingua
24
24
  words = text.dup.split(/[^a-zA-Zàèéìòù'0-9]+/)
25
25
  hyphenation = ""
26
26
  words.each do |word|
27
- word.gsub!(/(#{V})(#{S})/i, '\1=iu=t')
28
- word.gsub!(/(#{V})(#{Z})/i, '\1=\2')
29
- word.gsub!(/(#{X})(#{V})/i, '\1=\2')
30
- word.gsub!(/(#{C})(#{V})(#{V})(#{Y})/, '\1\2=\3=\4')
31
- word.gsub!(/(#{V})([bcfgptv][lr])/i, '\1=\2')
32
- word.gsub!(/(#{V})([cg]h)/i, '\1=\2')
33
- word.gsub!(/(#{V})(gn)/i, '\1=\2')
34
- word.gsub!(/(#{C})\1/i, '\1=\1')
35
- word.gsub!(/(s#{C})/i, '=\1')
36
- 1 while word.gsub!(/(#{V}*#{C}+#{V}+)(#{C}#{V})/i, '\1=\2')
37
- 1 while word.gsub!(/(#{V}*#{C}+#{V}+#{C})(#{C})/i, '\1=\2')
38
- word.gsub!(/^(#{V}+#{C})(#{C})/i, '\1=\2')
39
- word.gsub!(/^(#{V}+)(#{C}#{V})/i, '\1=\2')
27
+ word.gsub!(/(#{V})(#{S})/io, '\1=iu=t')
28
+ word.gsub!(/(#{V})(#{Z})/io, '\1=\2')
29
+ word.gsub!(/(#{X})(#{V})/io, '\1=\2')
30
+ word.gsub!(/(#{C})(#{V})(#{V})(#{Y})/io, '\1\2=\3=\4')
31
+ word.gsub!(/(#{V})([bcfgptv][lr])/io, '\1=\2')
32
+ word.gsub!(/(#{V})([cg]h)/io, '\1=\2')
33
+ word.gsub!(/(#{V})(gn)/io, '\1=\2')
34
+ word.gsub!(/(#{C})\1/io, '\1=\1')
35
+ word.gsub!(/(s#{C})/io, '=\1')
36
+ 1 while word.gsub!(/(#{V}*#{C}+#{V}+)(#{C}#{V})/io, '\1=\2')
37
+ 1 while word.gsub!(/(#{V}*#{C}+#{V}+#{C})(#{C})/io, '\1=\2')
38
+ word.gsub!(/^(#{V}+#{C})(#{C})/io, '\1=\2')
39
+ word.gsub!(/^(#{V}+)(#{C}#{V})/io, '\1=\2')
40
40
  word.sub!(/^=/, '')
41
41
  word.sub!(/=$/, '')
42
42
  word.gsub!(/=+/,'=');
@@ -44,8 +44,8 @@ module Lingua
44
44
  word.gsub!(/(le)([oa]n)/i, '\1=\2')
45
45
  word.gsub!(/(le)([oa])(an)/i, '\1=\2=\3')
46
46
  word.gsub!(/(spe)=(le)=(o)/i, '\1=\2\3')
47
- word.gsub!(/([gd]i)=(#{V})/i, '\1\2')
48
- word.gsub!(/(ni)=(#{V})/i, '\1\2')
47
+ word.gsub!(/([gd]i)=(#{V})/io, '\1\2')
48
+ word.gsub!(/(ni)=(#{V})/io, '\1\2')
49
49
  word.gsub!(/=(e)=(l)/i, '\1\2')
50
50
  hyphenation += "#{word}="
51
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lingua-it-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.5
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrea Giacomo Baldan