RubyGems - lingua-it-readability - Versions diffs - 1.0.5 → 1.1.0 - Mend

lingua-it-readability 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +8 -0
data/README.md +2 -2
data/Rakefile +0 -2
data/lib/lingua/it/readability/version.rb +1 -1
data/lib/lingua/it/readability.rb +16 -5
data/lib/lingua/it/sentence.rb +33 -9
data/lib/lingua/it/syllable.rb +15 -15
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5cf375da6bf5b5319b371b9296e70eaecdc8e08a
-  data.tar.gz: 70271a6abd3616ce4b5f90eca6295a3229b129bf
+  metadata.gz: 470c3c7ea7eb28b6f52bc516d9f4a161d287c1e8
+  data.tar.gz: d5e4a825918c07682500ab93d3e64be02ea8fb53
 SHA512:
-  metadata.gz: 994dbba2a9793d12c1b347e3c3b225e3b6d723f350c41ad4b903b784c8714a087a8934a68cd6ca9bc983f2b17cd8b6a30ba84e62291a6f5291ffecc821e8e690
-  data.tar.gz: 99db5e027c07ba5ea001d250c7b10102f6002530b79839f4bf6474cd5c86651f1f386988483db5d7a57c0cd143e35b9597e1502cb96bf79513b4d1448dd70bdb
+  metadata.gz: 9b4a7e504931bd2dac6d06dad98e41dcd992a0135b55249b099ff5d582933a652be4ef8e9c190678606f1f1a83c6186a083b3eaf86d86be25bce3bf52b5086dd
+  data.tar.gz: 2feae4565d24deb075bac0ee79e11c9c6652a4940a8e408ad1c4896114b8370fe31ce4f5901a161fdf833d3900cd3e819a4329caf8f233119b5d2ed64255ef5c

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,11 @@
+#### 1.1.0 - 2016-02-09
+###### Added
+- Optimized regex, greatly improved overall performance
+- Possibility to add custom symbols as end delimiter for sentences
+- Readme usage section
+###### Fixed
+- Minor bugs
 #### 1.0.4 - 2016-02-09
 ###### Added
 - Readme usage section

data/README.md CHANGED Viewed

@@ -32,9 +32,9 @@ report.gulpease      # 59
 report.flesch        # 36.92
 report.report        # a formatted summary of statistics and measures
-# accept type 'scientific' to treat list items separated by semicolons as sentences
+# accept custom symbols as delimiters
 text = "Lista:\n- Gennaio;\n- Febbraio;"
-report = Lingua::IT::Readability.new(text)
+report = Lingua::IT::Readability.new(text, ':', '-')
 report.num_sentences # 3
 report.num_words     # 3
 report.num_syllables # 8

data/Rakefile CHANGED Viewed

@@ -1,5 +1,3 @@
-require "rubygems"
-require "rake"
 require "bundler/gem_tasks"
 require "rspec/core/rake_task"

data/lib/lingua/it/readability/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Lingua
   module It
     module Readability
-      VERSION = "1.0.5"
+      VERSION = "1.1.0"
     end
   end
 end

data/lib/lingua/it/readability.rb CHANGED Viewed

@@ -10,19 +10,25 @@ module Lingua
   module IT
     class Readability
       attr_reader :text
-      attr_reader :text
-      attr_reader :type
+      attr_reader :sent
       attr_reader :paragraphs
       attr_reader :sentences
       attr_reader :words
       attr_reader :frequencies
       # Initialize the sample with +text+
-      def initialize(text = '', type = 'standard')
+      def initialize(text = '', *delimiters)
+        @sent = Lingua::IT::Sentence
+        if(!delimiters.empty?)
+          @sent.delimiter(delimiters)
+        else
+          @sent.reset_delimiter
+        end
         @text                = text.dup
-        @type                = type
         @paragraphs          = Lingua::IT::Paragraph.paragraphs(self.text)
-        @sentences           = Lingua::IT::Sentence.sentences(self.text, self.type)
+        @sentences           = @sent.sentences(self.text)
         @words               = []
         @frequencies         = {}
         @frequencies.default = 0
@@ -30,6 +36,11 @@ module Lingua
         count_words
       end
+      # Reset Lingua::IT::Sentence symbols delimiter cache
+      def reset_delimiter
+        @sent.reset_delimiter
+      end
       # The number of paragraphs in the sample. A paragraph is defined as a
       # newline followed by one or more empty or whitespace-only lines.
       def num_paragraphs

data/lib/lingua/it/sentence.rb CHANGED Viewed

@@ -8,6 +8,8 @@ module Lingua
       class << self
         attr_reader :abbreviations
         attr_reader :abbr_regex
+        attr_reader :delimiters
+        attr_reader :delim_regex
       end
       # Common abbreviations
@@ -16,22 +18,18 @@ module Lingua
       MONTHS = %w(gen feb mar apr mag giu lug ago set sett ott nov dic) unless defined?(MONTHS)
       DAYS   = %w(lun mar mer gio ven sab dom) unless defined?(DAYS)
-      # Text types
-      TYPES = {
-        'standard'   => /["']?[A-Z][^.?!]+((?![.?!]['"]?\s["']?[A-Z][^.?!]).)+[.?!'"]+/,
-        'scientific' => /["']?[A-Z][^.;:?!]+((?![.;:?!]['"]?\s["']?[A-Z][^.;:?!]).)+[.;:?!'"]+/
-      }
-      TYPES.default_proc = proc { |hash, key| hash[key] = /["']?[A-Z][^.?!]+((?![.?!]['"]?\s["']?[A-Z][^.?!]).)+[.?!'"]+/ }
+      # Standard delimiters
+      STD = %w(. ? !)
       # Split up in sentences, use 0002 as a temporary end mark for
       # the abbreviations found, even if the regex should be enough
       # to recognize real stop point from abbreviations ones.
       # A sentences should definetly end marked only by a . or a ?
       # or a !
-      def self.sentences(text, type = 'standard')
+      def self.sentences(text)
         txt = text.dup
         txt.gsub!(/\b(#{@abbr_regex})(\.)\B/i, '\10002')
-        txt.gsub!(/#{TYPES[type]}/, '\2\001')
+        txt.gsub!(/["']?[A-Z][^#{@delim_regex}]+((?![#{@delim_regex}]['"]?\s["']?[A-Z][^#{@delim_regex}]).)+[#{@delim_regex}'"]+/, '\2\001')
         txt.gsub!(/\b(#{@abbr_regex})(0002)/i, '\1.')
         txt.split(/01/).map { |sentence| sentence.strip }
       end
@@ -44,6 +42,20 @@ module Lingua
         @abbreviations
       end
+      # Add symbols to sentence delimters
+      def self.delimiter(*delimiters)
+        @delimiters += delimiters
+        @delimiters.uniq!
+        set_delim_regex!
+        @delimiters
+      end
+      def self.reset_delimiter
+        @delimiters = STD
+        set_delim_regex!
+        @delimiters
+      end
       private
       # Utility method, chain up all abbreviations constants arrays
       def self.initialize_abbreviations!
@@ -57,8 +69,20 @@ module Lingua
         @abbr_regex = "#{@abbreviations.join('|')}"
       end
-      initialize_abbreviations!
+      # Utility method, chain up all delimiters constants arrays
+      def self.initialize_delimiters!
+        @delimiters = STD
+        set_delim_regex!
+      end
+      # Utility method, join all elements of the delimiters arrays
+      # without a separator, making suitable for a regex.
+      def self.set_delim_regex!
+        @delim_regex = "#{@delimiters.join('\\')}"
+      end
+      initialize_abbreviations!
+      initialize_delimiters!
     end
   end
 end

data/lib/lingua/it/syllable.rb CHANGED Viewed

@@ -24,19 +24,19 @@ module Lingua
         words = text.dup.split(/[^a-zA-Zàèéìòù'0-9]+/)
         hyphenation = ""
         words.each do |word|
-          word.gsub!(/(#{V})(#{S})/i, '\1=iu=t')
-          word.gsub!(/(#{V})(#{Z})/i, '\1=\2')
-          word.gsub!(/(#{X})(#{V})/i, '\1=\2')
-          word.gsub!(/(#{C})(#{V})(#{V})(#{Y})/, '\1\2=\3=\4')
-          word.gsub!(/(#{V})([bcfgptv][lr])/i, '\1=\2')
-          word.gsub!(/(#{V})([cg]h)/i, '\1=\2')
-          word.gsub!(/(#{V})(gn)/i, '\1=\2')
-          word.gsub!(/(#{C})\1/i, '\1=\1')
-          word.gsub!(/(s#{C})/i, '=\1')
-          1 while word.gsub!(/(#{V}*#{C}+#{V}+)(#{C}#{V})/i, '\1=\2')
-          1 while word.gsub!(/(#{V}*#{C}+#{V}+#{C})(#{C})/i, '\1=\2')
-          word.gsub!(/^(#{V}+#{C})(#{C})/i, '\1=\2')
-          word.gsub!(/^(#{V}+)(#{C}#{V})/i, '\1=\2')
+          word.gsub!(/(#{V})(#{S})/io, '\1=iu=t')
+          word.gsub!(/(#{V})(#{Z})/io, '\1=\2')
+          word.gsub!(/(#{X})(#{V})/io, '\1=\2')
+          word.gsub!(/(#{C})(#{V})(#{V})(#{Y})/io, '\1\2=\3=\4')
+          word.gsub!(/(#{V})([bcfgptv][lr])/io, '\1=\2')
+          word.gsub!(/(#{V})([cg]h)/io, '\1=\2')
+          word.gsub!(/(#{V})(gn)/io, '\1=\2')
+          word.gsub!(/(#{C})\1/io, '\1=\1')
+          word.gsub!(/(s#{C})/io, '=\1')
+          1 while word.gsub!(/(#{V}*#{C}+#{V}+)(#{C}#{V})/io, '\1=\2')
+          1 while word.gsub!(/(#{V}*#{C}+#{V}+#{C})(#{C})/io, '\1=\2')
+          word.gsub!(/^(#{V}+#{C})(#{C})/io, '\1=\2')
+          word.gsub!(/^(#{V}+)(#{C}#{V})/io, '\1=\2')
           word.sub!(/^=/, '')
           word.sub!(/=$/, '')
           word.gsub!(/=+/,'=');
@@ -44,8 +44,8 @@ module Lingua
           word.gsub!(/(le)([oa]n)/i, '\1=\2')
           word.gsub!(/(le)([oa])(an)/i, '\1=\2=\3')
           word.gsub!(/(spe)=(le)=(o)/i, '\1=\2\3')
-          word.gsub!(/([gd]i)=(#{V})/i, '\1\2')
-          word.gsub!(/(ni)=(#{V})/i, '\1\2')
+          word.gsub!(/([gd]i)=(#{V})/io, '\1\2')
+          word.gsub!(/(ni)=(#{V})/io, '\1\2')
           word.gsub!(/=(e)=(l)/i, '\1\2')
           hyphenation += "#{word}="
         end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: lingua-it-readability
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.1.0
 platform: ruby
 authors:
 - Andrea Giacomo Baldan