docuvator 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $:.unshift File.expand_path("../../lib", __FILE__)
4
+ require 'docuvator'
5
+
6
# Build the CLI first: its option parser consumes the switches from ARGV
# (OptionParser#parse!), so only file arguments remain for the loop below.
docuvator = Docuvator::CLI.new

# Resolve every argument against the current working directory.
# File.expand_path leaves absolute paths as they are, whereas prefixing
# Dir.pwd unconditionally turned "/tmp/a.txt" into "<cwd>//tmp/a.txt".
ARGV.each do |file|
  docuvator.process(File.expand_path(file))
end
@@ -0,0 +1,44 @@
1
+ require 'docuvator/initialize'
2
+
3
module Docuvator
  # Command line front end: reads the switches from ARGV, adjusts the Log
  # level to match and hands files to a Splitter.
  class CLI
    # OpenStruct holding the parsed switches (+verbose+ and +debug+).
    attr_reader :options

    # Starts with both switches off, then applies whatever ARGV contains and
    # configures logging from the result.
    def initialize
      @options = OpenStruct.new(verbose: false, debug: false)
      parse_options
      determine_log_level
    end

    # Logs the file name at INFO level and splits the file.
    #
    # file - path passed straight to Splitter.new.
    #
    # Returns the result of Splitter#split.
    def process(file)
      Log.info "Processing #{file}"
      splitter = Splitter.new(file)
      splitter.split
    end

    # Declares -v/--verbose and -d/--debug and parses them with
    # OptionParser#parse!, which removes recognised switches from ARGV.
    #
    # Returns the result of OptionParser#parse!.
    def parse_options
      parser = OptionParser.new
      parser.version = VERSION
      parser.on("-v", "--verbose", "Verbose output (shows INFO level log statements)") {
        options.verbose = true
      }
      parser.on("-d", "--debug", "Debug output (shows DEBUG level log statements)") {
        options.debug = true
      }
      parser.parse!
    end

    private

    # Debug takes precedence over verbose; with neither switch set the Log
    # level is not touched.
    def determine_log_level
      return Log.level = Logger::INFO if options.verbose && !options.debug
      return unless options.debug

      Log.level = Logger::DEBUG
      Log.use_debug
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'pathname'
2
+ require 'ostruct'
3
+ require 'optparse'
4
+
5
# Require every Ruby file found at any depth below this file's directory.
Dir[File.dirname(__FILE__) + '/**/*.rb'].each { |path| require path }
@@ -0,0 +1,59 @@
1
+ require 'logger'
2
+ require 'singleton'
3
+
4
module Docuvator
  # Singleton wrapper around a stdlib Logger. Class-level calls that the
  # wrapped logger responds to (info, error, level=, progname, ...) are
  # forwarded to it through method_missing.
  class Log
    include Singleton

    attr_accessor :logger, :base_directory, :debugging

    # Creates a STDOUT logger at ERROR level that prints only the message.
    # base_directory is the directory one level above this file's directory,
    # with a trailing slash; parse_caller strips it from caller paths.
    def initialize
      @debugging = false
      @base_directory = "#{File.expand_path('../..', __FILE__)}/"
      @logger = Logger.new(STDOUT).tap do |stdout_logger|
        stdout_logger.level = Logger::ERROR
        stdout_logger.formatter = proc { |_sev, _datetime, _progname, msg| "#{msg}\n" }
      end
    end

    class << self
      # Turns on caller tracking and switches to a formatter that prefixes
      # every message with its severity and progname.
      def use_debug
        instance.debugging = true
        instance.logger.formatter = proc { |sev, _datetime, progname, msg|
          "#{sev} [#{progname}]: #{msg}\n"
        }
      end

      # Determine the file and line number of the caller
      #
      # message - one backtrace entry such as "path/file.rb:12:in `meth'".
      #
      # Returns "file:line" with base_directory removed from the file part,
      # or nil when the message does not look like a backtrace entry.
      def parse_caller(message)
        location = /^(?<file>.+?):(?<line>\d+)(?::in `(?<method>.*)')?/.match(message)
        return unless location

        relative_file = location[:file].sub(instance.base_directory, "")
        "#{relative_file}:#{location[:line]}"
      end

      # Forwards anything the wrapped logger understands. While debugging,
      # the logger's progname is first set to the call site of this call.
      def method_missing(method, *args, &blk)
        return super unless valid_method?(method)

        # caller(1) must be evaluated here: from this frame it is the code
        # that invoked the logging method.
        instance.logger.progname = parse_caller(caller(1).first) if instance.debugging
        instance.logger.send(method, *args, &blk)
      end

      # Keeps respond_to? consistent with method_missing.
      def respond_to_missing?(method, include_all=false)
        valid_method?(method) || super
      end

      # True when the wrapped logger responds to +method+.
      def valid_method?(method)
        instance.logger.respond_to? method
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
module Docuvator
  # Reads a text file and breaks it into paragraphs, sentences and a
  # per-word frequency table.
  class Splitter
    attr_accessor :paragraphs, :sentences, :word_frequency
    attr_reader :file

    # Sentence ignore set: abbreviations whose trailing period does not end
    # a sentence.
    IGNORE_SET = ['Mr', 'Mrs', 'Ms', 'Jr', 'Dr', 'Prof', 'Sr']

    REGEX_NEWLINE = /\n+/
    REGEX_SPACES = /\s+/
    REGEX_PUNCTUATION = /[.!?,:;\[\]\(\)]+/
    REGEX_SENTENCES = /(?<=[.!?]|[.!?][\'"])(?<! #{IGNORE_SET.join('\\.| ') + '\\.|'} \s[A-Z]\.)\s+/ix
    # Regex to split up sentences - http://stackoverflow.com/a/5844564/583592
    #
    #   (?<=              # Begin positive lookbehind.
    #     [.!?]           # Either an end of sentence punct,
    #   | [.!?][\'"]      # or end of sentence punct and quote.
    #   )                 # End positive lookbehind.
    #   (?<!              # Begin negative lookbehind.
    #     Mr\.            # Skip either "Mr."
    #   | Mrs\.           # or "Mrs.",
    #   | Ms\.            # or "Ms.",
    #   | Jr\.            # or "Jr.",
    #   | Dr\.            # or "Dr.",
    #   | Prof\.          # or "Prof.",
    #   | Sr\.            # or "Sr.",
    #   | \s[A-Z]\.       # or initials ex: "George W. Bush",
    #   )                 # End negative lookbehind.
    #   \s+               # Split on whitespace between sentences.
    #   /ix

    # file - path of the text file to analyse.
    #
    # Logs an error and terminates the process with status 1 (raising
    # SystemExit) when the file does not exist.
    def initialize(file)
      unless File.exist?(file)
        Log.error "Cannot split #{file} as it does not exist"
        exit 1
      end

      @file = file
    end

    # Fills +paragraphs+, +sentences+ and +word_frequency+ from the file.
    # Every call rebuilds all three from scratch, so repeated calls yield the
    # same result.
    #
    # Returns the value of the final Log.info call.
    def split
      # Split up text into paragraphs. File.read closes the handle once the
      # content is loaded. Blank or whitespace-only lines collapse to "" and
      # are dropped so they are not counted as paragraphs.
      @paragraphs = File.read(@file)
                        .split(REGEX_NEWLINE)
                        .map { |paragraph| paragraph.gsub(REGEX_SPACES, ' ').strip }
                        .reject(&:empty?)
      Log.info "Number of paragraphs: #{@paragraphs.size}"

      # Split up paragraphs into sentences
      @sentences = @paragraphs.flat_map { |paragraph| paragraph.split(REGEX_SENTENCES) }
      Log.info "Number of sentences: #{@sentences.size}"

      # Count words case-insensitively with punctuation removed.
      # NOTE(review): a token made only of punctuation is counted under the
      # "" key, as before - confirm whether that is wanted.
      @word_frequency = Hash.new(0)
      @sentences.each do |sentence|
        sentence.split.each do |token|
          @word_frequency[token.gsub(REGEX_PUNCTUATION, '').downcase] += 1
        end
      end
      Log.info "Unique words: #{@word_frequency.size}"
    end
  end
end
@@ -0,0 +1,3 @@
1
module Docuvator
  # Version of the gem. Frozen so the shared constant cannot be mutated in
  # place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,54 @@
1
+ require 'spec_helper'
2
+
3
+ include Docuvator
4
+
5
describe Log do

  # Each example works on a fresh anonymous subclass of Log.
  let(:log) { Class.new(Log) }

  context "initializes instance" do
    it "should acts as singleton" do
      log.instance.should == log.instance
    end

    it "should have a logger" do
      log.instance.logger.should be_a Logger
    end

    it "should be a Log (FakeLog)" do
      # Comparing log.class with Log.class only compared Class with Class and
      # could never fail; check the actual inheritance instead.
      log.superclass.should eq Log
    end

    it "should react to Logger methods" do
      Logger.public_instance_methods.each do |method|
        log.valid_method?(method).should be_true
      end
    end
  end

  context "#use_debug" do
    it "logger's progname before" do
      log.progname.should be_nil
    end

    it "logger's progname after" do
      log.use_debug
      log.progname.should_not be_nil
    end
  end

  context "#parse_caller" do
    context "with nothing" do
      it { log.parse_caller(nil).should be_nil }
    end

    context "with jumble (random text)" do
      it { log.parse_caller("asdaacsdc").should be_nil }
    end

    context "with valid caller" do
      it { log.parse_caller("docuvator/lib/docuvator.rb:45:in `respond_to_missing?'").should eq "docuvator/lib/docuvator.rb:45" }
    end
  end

end
@@ -0,0 +1,47 @@
1
+ require 'spec_helper'
2
+
3
+ include Docuvator
4
+
5
describe Splitter do
  # Default subject: a Splitter built from whatever +file+ the context defines.
  let(:splitter) { Splitter.new(file) }

  context "initializes instance" do
    context "with valid file" do
      let(:file) { fixture('lorem_ipsum.txt') }

      it { splitter.file.should eq file }
    end

    context "with invalid file" do
      let(:file) { fixture('invalid.txt') }

      # Construction itself exits, so it has to be wrapped in a lambda.
      it { lambda { splitter }.should raise_error SystemExit }
    end
  end

  describe "#split" do
    context "valid file" do
      let(:file) { fixture('lorem_ipsum.txt') }
      let(:splitter) { Splitter.new(file).tap(&:split) }

      it { splitter.paragraphs.size.should eq 3 }
      it { splitter.sentences.size.should eq 39 }
      it { splitter.word_frequency.size.should eq 124 }
    end

    context "valid file (idempotence)" do
      let(:file) { fixture('lorem_ipsum.txt') }
      # Splitting twice must give the same totals as splitting once.
      let(:splitter) do
        Splitter.new(file).tap { |instance| 2.times { instance.split } }
      end

      it { splitter.paragraphs.size.should eq 3 }
      it { splitter.sentences.size.should eq 39 }
      it { splitter.word_frequency.size.should eq 124 }
    end
  end
end
metadata ADDED
@@ -0,0 +1,56 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: docuvator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Kevin Jalbert
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-07-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Gem which extracts document statistics and information
15
+ email:
16
+ - kevin.j.jalbert@gmail.com
17
+ executables:
18
+ - docuvator
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/docuvator/initialize.rb
23
+ - lib/docuvator/log.rb
24
+ - lib/docuvator/splitter.rb
25
+ - lib/docuvator/version.rb
26
+ - lib/docuvator.rb
27
+ - spec/log_spec.rb
28
+ - spec/splitter_spec.rb
29
+ - bin/docuvator
30
+ homepage: https://github.com/kevinjalbert/docuvator
31
+ licenses: []
32
+ post_install_message:
33
+ rdoc_options: []
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ required_rubygems_version: !ruby/object:Gem::Requirement
43
+ none: false
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirements: []
49
+ rubyforge_project:
50
+ rubygems_version: 1.8.25
51
+ signing_key:
52
+ specification_version: 3
53
+ summary: Gem which extracts document statistics and information
54
+ test_files:
55
+ - spec/log_spec.rb
56
+ - spec/splitter_spec.rb