lingua-it-readability 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ca7c5b85336a54accf315881255133c101aef7cd
4
+ data.tar.gz: d1ffbc35b45fa73951cddad360cfc1b696ce8ea2
5
+ SHA512:
6
+ metadata.gz: ccc18e702fd8487276d79542117377c436cdd5b7b4d46f108e583290430051f1750808d424f40d9217ba6173d3bf0f0d08f31d6ce47833d15435c1e77176d763
7
+ data.tar.gz: 59d73c1511929d8bb9ce5a9fe38017e717025612e868e8415d0d6ab9c49e0fcae0d5f35d4a61444d034208b03cf045f274b744af978d7b53808024688b8c4444
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.0
4
+ before_install: gem install bundler -v 1.11.2
data/CHANGELOG.md ADDED
@@ -0,0 +1,17 @@
1
+ #### 1.0.0 - 2016-02-09
2
+ ###### Added
3
+ - Some more tests.
4
+
5
+ #### 0.6.0 - 2016-02-08
6
+ ###### Added
7
+ - Types of text.
8
+ - Some more tests.
9
+
10
+ #### 0.5.0 - 2016-02-05
11
+ ###### Added
12
+ - Initial release.
13
+ - Sentences recognition
14
+ - Italian abbreviations
15
+ - Syllables recognition
16
+ - Gulpease readability index
17
+ - Italian Flesch readability index
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # The gem's dependencies are declared in lingua-it-readability.gemspec; +gemspec+ below tells Bundler to read them from there
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Andrea Giacomo Baldan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,44 @@
1
+ [![Build Status](https://travis-ci.org/codepr/lingua.svg?branch=master)](https://travis-ci.org/codepr/lingua)
2
+
3
+ # Lingua::It::Readability
4
+
5
+ Inspired by Lingua::EN::Readability and its original Perl version Lingua::EN::Fathom, this gem focuses on the readability of Italian-language texts.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'lingua-it-readability'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install lingua-it-readability
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Changelog
34
+
35
+ See the [CHANGELOG](CHANGELOG.md) file.
36
+
37
+ ## Contributing
38
+
39
+ Bug reports and pull requests are welcome on GitHub at https://github.com/codepr/lingua-it-readability.
40
+
41
+
42
+ ## License
43
+
44
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks" # Bundler's packaging tasks (the README refers to `rake install` and `rake release`)
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec) # defines the `spec` task
5
+
6
+ task :default => :spec # a bare `rake` runs the specs
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "lingua/it/readability" # this file in turn requires every other .rb file found under lib/lingua/it
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # To use Pry instead of IRB, uncomment the two lines below (and don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail # stop on errors, on unset variables and on a failure anywhere in a pipeline
3
+ IFS=$'\n\t' # split words on newlines and tabs only, not on spaces
4
+ set -vx # echo each command as it is read and as it is executed
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,13 @@
1
+ module Lingua
2
+ module IT
3
+ module Paragraph
4
+
5
+ # Split +text+ into paragraphs and return them as an Array of stripped Strings.
6
+ # The separator is one or more newlines, each optionally followed by \r, tab or space
7
+ # characters; the \r of a Windows \r\n ending is removed by the final strip. NOTE(review): a leading newline yields an empty first paragraph.
8
+ def self.paragraphs(text)
9
+ text.dup.split(/(?:\n[\r\t ]*)+/).collect { |p| p.strip }
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,7 @@
1
+ module Lingua
2
+ module It # NOTE(review): spelled "It" here while the library code lives under Lingua::IT; these are two distinct constants
3
+ module Readability
4
+ VERSION = "1.0.0" # read by the gemspec as spec.version
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,140 @@
1
+ # coding: utf-8
2
+ prefix = File.dirname(__FILE__) + "/" # directory that contains this file
3
+ $LOAD_PATH.unshift prefix # put that directory first on the load path
4
+
5
+ Dir.glob(prefix + "**/*.rb").each do |f| # every .rb file below that directory, this file included
6
+ require File.expand_path(f) # required by absolute path
7
+ end
8
+
9
+ module Lingua
10
+ module IT
11
+ class Readability
12
+ attr_reader :text
13
+ attr_reader :type
14
+ attr_reader :paragraphs
15
+ attr_reader :sentences
16
+ attr_reader :words
17
+ attr_reader :frequencies
18
+
19
+ # Initialize the sample with a copy of +text+; +type+ ('standard' by default) is passed to Lingua::IT::Sentence to select the sentence pattern. All statistics are computed here, once.
20
+ def initialize(text, type = 'standard')
21
+ @text = text.dup
22
+ @type = type
23
+ @paragraphs = Lingua::IT::Paragraph.paragraphs(self.text)
24
+ @sentences = Lingua::IT::Sentence.sentences(self.text, self.type)
25
+ @words = []
26
+ @frequencies = {}
27
+ @frequencies.default = 0
28
+ @syllables = Lingua::IT::Syllable.syllables(self.text)
29
+ count_words
30
+ end
31
+
32
+ # The number of paragraphs in the sample. Paragraphs are produced by
33
+ # Lingua::IT::Paragraph, which splits the text on one or more newlines.
34
+ def num_paragraphs
35
+ paragraphs.length
36
+ end
37
+
38
+ # The number of sentences in the sample. The meaning of a "sentence" is
39
+ # defined by Lingua::IT::Sentence.
40
+ def num_sentences
41
+ @sentences.length
42
+ end
43
+
44
+ # The number of characters in the sample. Counted are ASCII letters, digits and
45
+ # underscores, the accented vowels àèéìòù (case-insensitively) and the brackets ()[]{}; spaces and other punctuation are not counted.
46
+ def num_chars
47
+ @text.dup.gsub(/[[:punct:]][[:space:]]/, '').scan(/[a-zA-Z0-9_Èàòèéìù\(\)\[\]\{\}]/i).length
48
+ end
49
+ alias :num_characters :num_chars
50
+
51
+ # The number of words in the sample. A word is defined as a sequence of
52
+ # characters, not taking account of punctuation and spaces, see private
53
+ # method +count_words+ for additional info about a word definition
54
+ def num_words
55
+ words.length
56
+ end
57
+
58
+ # The total number of syllables in the text sample. Syllables are defined
59
+ # in Lingua::IT::Syllable.
60
+ def num_syllables
61
+ @syllables.length
62
+ end
63
+
64
+ # The number of different unique words used in the text sample.
65
+ def num_unique_words
66
+ @frequencies.keys.length
67
+ end
68
+
69
+ # An array containing each unique word used in the text sample.
70
+ def unique_words
71
+ @frequencies.keys
72
+ end
73
+
74
+ # The number of occurrences of the word +word+ in the text sample (0 if the word never appears).
75
+ def occurrences(word)
76
+ @frequencies[word]
77
+ end
78
+
79
+ # The average number of words per sentence, rounded to two decimals.
80
+ def words_per_sentence
81
+ ((words.length.to_f / sentences.length.to_f) * 100).round / 100.0
82
+ end
83
+
84
+ # The average number of syllables per word, rounded to two decimals. The
85
+ # syllable count is performed by Lingua::IT::Syllable, and so may not be
86
+ # completely accurate
87
+ def syllables_per_word
88
+ ((@syllables.length.to_f / words.length.to_f) * 100).round / 100.0
89
+ end
90
+
91
+ # Gulpease index of readability expressly calibrated to suit italian
92
+ # text samples.
93
+ # An index < 40 means a low readable sample, between 40 and 60 it
94
+ # represents a medium readable sample, over 60 a well written sample
95
+ # easily readable by an under 16 person. NOTE(review): every operand below is an Integer, so the division is Integer division (the quotient is floored) and it raises ZeroDivisionError when the sample has no words; +report+ prints the result with a float format.
96
+ def gulpease
97
+ 89 + (((300 * num_sentences) - (10 * num_chars)) / num_words)
98
+ end
99
+
100
+ # Flesch index of readability expressly calibrated to suit italian
101
+ # text samples, derived from U.S. Flesch index.
102
+ # An index < 40 means a low readable sample, between 40 and 60 it
103
+ # represents a medium readable sample, over 60 a well written sample
104
+ # easily readable by an under 16 person. Computed as 206 - 65 * (syllables / words) - (words / sentences), rounded to two decimals.
105
+ def flesch
106
+ ((206.0 - (65.0 * (num_syllables.to_f / num_words.to_f)) -
107
+ ((num_words.to_f / num_sentences.to_f))) * 100).round / 100.0
108
+ end
109
+
110
+ # A nicely formatted report on the sample, showing the most useful
111
+ # stats
112
+ def report
113
+ sprintf "Number of paragraphs %d \n" <<
114
+ "Number of sentences %d \n" <<
115
+ "Number of words %d \n" <<
116
+ "Number of characters %d \n\n" <<
117
+ "Average words per sentence %.2f \n" <<
118
+ "Average syllables per word %.2f \n\n" <<
119
+ "Gulpease score %2.2f \n" <<
120
+ "Flesch score %2.2f \n",
121
+ num_paragraphs, num_sentences, num_words, num_characters,
122
+ words_per_sentence, syllables_per_word, gulpease,
123
+ flesch
124
+ end
125
+
126
+ private
127
+
128
+ # Splits the sample into words (stored in @words) and tallies each one in @frequencies. A word is a sequence
129
+ # of single characters excluding punctuation, except for all kinds of
130
+ # parentheses like () [] and {}. Being calibrated for the Italian language
131
+ # it also keeps accented characters. NOTE(review): é and ì are listed in the num_chars pattern but not in the one below, so words containing them appear to be split at that letter; confirm.
132
+ def count_words
133
+ @words = @text.dup.gsub(/[^\wÈèòàù\(\)\[\]\{\}]/i, ' ').strip.split(/\s+/)
134
+ @words.each do |word|
135
+ @frequencies[word] += 1
136
+ end
137
+ end
138
+ end
139
+ end
140
+ end
@@ -0,0 +1,64 @@
1
+ module Lingua
2
+ module IT
3
+ class Sentence
4
+
5
+ # Takes Italian text and splits it into sentences, respecting
6
+ # general abbreviations. More abbreviations can be added
7
+ # (see +abbreviation+) to be taken into account during the process.
8
+ class << self
9
+ attr_reader :abbreviations
10
+ attr_reader :abbr_regex
11
+ end
12
+
13
+ # Common abbreviations
14
+ TITLES = %w(sig sigg dott preg prof mr jr amn avv co stim dr egr geom ing mons on rag rev soc spett card ill gent cav) unless defined?(TITLES)
15
+ MISC = %w(p v femm dim ecc etc corr cc bcc all es fatt g gg id int lett ogg pag pagg cap pp tel ind v n num min sec ms abbr agg art aus) unless defined?(MISC)
16
+ MONTHS = %w(gen feb mar apr mag giu lug ago set sett ott nov dic) unless defined?(MONTHS)
17
+ DAYS = %w(lun mar mer gio ven sab dom) unless defined?(DAYS)
18
+
19
+ # Sentence patterns by text type: 'scientific' also treats ; and : as sentence terminators. Any other type falls back to the 'standard' pattern through the default_proc below.
20
+ TYPES = {
21
+ 'standard' => /["']?[A-Z][^.?!]+((?![.?!]['"]?\s["']?[A-Z][^.?!]).)+[.?!'"]+/,
22
+ 'scientific' => /["']?[A-Z][^.;:?!]+((?![.;:?!]['"]?\s["']?[A-Z][^.;:?!]).)+[.;:?!'"]+/
23
+ }
24
+ TYPES.default_proc = proc { |hash, key| hash[key] = /["']?[A-Z][^.?!]+((?![.?!]['"]?\s["']?[A-Z][^.?!]).)+[.?!'"]+/ }
25
+
26
+ # Split +text+ into sentences and return them as an Array of stripped Strings. The
27
+ # text 0002 is used as a temporary end mark for the abbreviations found, even if the
28
+ # regex should be enough to tell real stop points from abbreviation ones.
29
+ # A sentence should definitely end only with a . or a ?
30
+ # or a ! NOTE(review): the markers look like plain digits (0002 for abbreviation dots, 01 as the split point after each matched sentence), so input that already contains those digit runs may be altered or split mid-sentence; confirm.
31
+ def self.sentences(text, type = 'standard')
32
+ txt = text.dup
33
+ txt.gsub!(/\b(#{@abbr_regex})(\.)\B/i, '\10002')
34
+ txt.gsub!(/#{TYPES[type]}/, '\2\001')
35
+ txt.gsub!(/\b(#{@abbr_regex})(0002)/i, '\1.')
36
+ txt.split(/01/).map { |sentence| sentence.strip }
37
+ end
38
+
39
+ # Add customized abbreviations to the standard set, rebuild the abbreviation regex and return the updated list
40
+ def self.abbreviation(*abbreviations)
41
+ @abbreviations += abbreviations
42
+ @abbreviations.uniq!
43
+ set_abbr_regex!
44
+ @abbreviations
45
+ end
46
+
47
+ private
48
+ # Utility method, chains up all the abbreviation constant arrays. NOTE(review): +private+ does not apply to methods defined with +def self.+, so the two helpers below remain publicly callable.
49
+ def self.initialize_abbreviations!
50
+ @abbreviations = TITLES + MISC + MONTHS + DAYS
51
+ set_abbr_regex!
52
+ end
53
+
54
+ # Utility method, joins all elements of the abbreviations array
55
+ # using | as separator, making the result suitable for a regex alternation.
56
+ def self.set_abbr_regex!
57
+ @abbr_regex = "#{@abbreviations.join('|')}"
58
+ end
59
+
60
+ initialize_abbreviations!
61
+
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,46 @@
1
+ # coding: utf-8
2
+ module Lingua
3
+ module IT
4
+ module Syllable
5
+
6
+ # This module is inspired by the Perl Lingua::IT::Hyphenation module.
7
+ # However, it uses a different (though not larger) set of patterns to
8
+ # compensate for the 'special cases' which arise out of Italian's
9
+ # irregular orthography. A number of extra patterns (particularly for
10
+ # derived word forms) means that this module is somewhat more accurate
11
+ # than the Perl original.
12
+
13
+ V = "[aeiouàèéìòù]" # vowels, plain and accented
14
+ C = "[b-df-hj-np-tv-z]" # consonants
15
+ S = "iut"
16
+ X = "fi|aci"
17
+ Y = "#{C}e"
18
+ Z = "i[aeo]"
19
+
20
+ def self.syllables(text) # returns an Array with one String per syllable found in +text+
21
+ words = text.dup.split(/[^a-zA-Zàèéìòù'0-9]+/)
22
+ hyphenation = ""
23
+ words.each do |word|
24
+ word.gsub!(/(#{V})(#{S})/i, '\1=iu=t')
25
+ word.gsub!(/(#{V})(#{Z})/i, '\1=\2')
26
+ word.gsub!(/(#{X})(#{V})/i, '\1=\2')
27
+ word.gsub!(/(#{Y})(#{V})/i, '\1=\2')
28
+ word.gsub!(/(#{V})([bcfgptv][lr])/i, '\1=\2')
29
+ word.gsub!(/(#{V})([cg]h)/i, '\1=\2')
30
+ word.gsub!(/(#{V})(gn)/i, '\1=\2')
31
+ word.gsub!(/(#{C})\1/i, '\1=\1')
32
+ word.gsub!(/(s#{C})/i, '=\1')
33
+ 1 while word.gsub!(/(#{V}*#{C}+#{V}+)(#{C}#{V})/i, '\1=\2')
34
+ 1 while word.gsub!(/(#{V}*#{C}+#{V}+#{C})(#{C})/i, '\1=\2')
35
+ word.gsub!(/^(#{V}+#{C})(#{C})/i, '\1=\2')
36
+ word.gsub!(/^(#{V}+)(#{C}#{V})/i, '\1=\2')
37
+ word.sub!(/^=/, '') # drop a separator left at the start of the word
38
+ word.sub!(/=$/, '') # ...and one left at the end
39
+ word.gsub!(/=+/,'='); # collapse runs of = into a single separator
40
+ hyphenation += "#{word}="
41
+ end
42
+ hyphenation.split('=')
43
+ end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__) # the lib/ directory next to this gemspec
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'lingua/it/readability/version' # defines Lingua::It::Readability::VERSION
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "lingua-it-readability"
8
+ spec.version = Lingua::It::Readability::VERSION
9
+ spec.authors = ["Andrea Giacomo Baldan"]
10
+ spec.email = ["a.g.baldan@gmail.com"]
11
+
12
+ spec.summary = %q{Text readability indexes and stats calibrated on Italian language.}
13
+ spec.description = %q{Text readability indexes and stats calibrated on Italian language. Inspired by Lingua::EN::Readability and the original perl module Lingua::EN::Fathom. Gulpease and Flesch for italian text is calculated.}
14
+ spec.homepage = "https://github.com/codepr/lingua-it-readability"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } # every git-tracked file outside test/, spec/ and features/
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } # NOTE(review): the packaged scripts live under bin/, not exe/, so this list comes out empty (the gem metadata shows executables: [])
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_development_dependency "bundler", "~> 1.11"
23
+ spec.add_development_dependency "rake", "~> 10.0"
24
+ spec.add_development_dependency "rspec", "~> 3.0"
25
+ end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lingua-it-readability
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrea Giacomo Baldan
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-02-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ description: Text readability indexes and stats calibrated on Italian language. Inspired
56
+ by Lingua::EN::Readability and the original perl module Lingua::EN::Fathom. Gulpease
57
+ and Flesch for italian text is calculated.
58
+ email:
59
+ - a.g.baldan@gmail.com
60
+ executables: []
61
+ extensions: []
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ".gitignore"
65
+ - ".rspec"
66
+ - ".travis.yml"
67
+ - CHANGELOG.md
68
+ - Gemfile
69
+ - LICENSE.txt
70
+ - README.md
71
+ - Rakefile
72
+ - bin/console
73
+ - bin/setup
74
+ - lib/lingua/it/paragraph.rb
75
+ - lib/lingua/it/readability.rb
76
+ - lib/lingua/it/readability/version.rb
77
+ - lib/lingua/it/sentence.rb
78
+ - lib/lingua/it/syllable.rb
79
+ - lingua-it-readability.gemspec
80
+ homepage: https://github.com/codepr/lingua-it-readability
81
+ licenses:
82
+ - MIT
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubyforge_project:
100
+ rubygems_version: 2.5.2
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: Text readability indexes and stats calibrated on Italian language.
104
+ test_files: []