docuvator 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby

# Command-line entry point for docuvator.
#
# Usage: docuvator [-v|--verbose] [-d|--debug] FILE...
#
# Puts the gem's lib directory on the load path, builds the CLI (which parses
# the option switches) and then processes every remaining argument as a file.

$:.unshift File.expand_path("../../lib", __FILE__)
require 'docuvator'

# Docuvator::CLI#initialize runs OptionParser#parse!, which removes the
# recognised switches from ARGV; what is left afterwards are the file names.
docuvator = Docuvator::CLI.new

ARGV.each do |file|
  # File.expand_path resolves relative names against the current working
  # directory and leaves absolute paths untouched. Prefixing Dir.pwd by hand
  # turned an absolute argument such as /tmp/a.txt into "<cwd>//tmp/a.txt",
  # which never exists.
  docuvator.process(File.expand_path(file))
end
@@ -0,0 +1,44 @@
1
+ require 'docuvator/initialize'
2
+
3
module Docuvator
  # Command-line front end.
  #
  # Constructing a CLI parses the option switches found in ARGV and adjusts
  # the log level accordingly; #process then runs the Splitter on one file.
  class CLI
    # OpenStruct holding the parsed flags (+verbose+ and +debug+).
    attr_reader :options

    # Starts with both flags off, then lets the command line override them.
    def initialize
      @options = OpenStruct.new(verbose: false, debug: false)
      parse_options
      determine_log_level
    end

    # Logs the file name at INFO level and splits the file.
    #
    # file - path handed straight to Splitter.new.
    #
    # Returns whatever Splitter#split returns.
    def process(file)
      Log.info "Processing #{file}"
      splitter = Splitter.new(file)
      splitter.split
    end

    # Registers the -v/--verbose and -d/--debug switches and parses ARGV
    # destructively (OptionParser#parse! with no argument).
    #
    # Returns the result of OptionParser#parse!.
    def parse_options
      parser = OptionParser.new
      parser.version = VERSION
      parser.on("-v", "--verbose", "Verbose output (shows INFO level log statements)") { options.verbose = true }
      parser.on("-d", "--debug", "Debug output (shows DEBUG level log statements)") { options.debug = true }
      parser.parse!
    end

    private

    # Maps the parsed flags onto the logger: debug takes precedence over
    # verbose; with neither flag the logger is left as it is.
    def determine_log_level
      unless options.debug
        Log.level = Logger::INFO if options.verbose
        return
      end

      Log.level = Logger::DEBUG
      Log.use_debug
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'pathname'
2
+ require 'ostruct'
3
+ require 'optparse'
4
+
5
# Require every Ruby file found (recursively) beneath this file's directory.
Dir[File.join(File.dirname(__FILE__), '**', '*.rb')].each { |path| require path }
@@ -0,0 +1,59 @@
1
+ require 'logger'
2
+ require 'singleton'
3
+
4
module Docuvator
  # Singleton wrapper around a stdlib Logger writing to STDOUT.
  #
  # Calls made on the class itself (Log.info, Log.level = ..., and so on) are
  # forwarded through method_missing to the wrapped Logger, provided the
  # Logger responds to them.
  class Log
    include Singleton

    attr_accessor :logger, :base_directory, :debugging

    # Builds the logger at ERROR level with a formatter that prints only the
    # message. +base_directory+ is the path two levels above this file, with
    # a trailing slash; parse_caller strips it from reported file names.
    def initialize
      @debugging = false
      @base_directory = File.expand_path("../..", __FILE__) + "/"
      @logger = Logger.new(STDOUT)
      @logger.level = Logger::ERROR
      @logger.formatter = ->(_sev, _datetime, _progname, msg) { "#{msg}\n" }
    end

    class << self
      # Turns on debugging mode: every forwarded call records its call site
      # as the progname, and the formatter prints severity and progname.
      def use_debug
        instance.debugging = true
        instance.logger.formatter = ->(sev, _datetime, progname, msg) { "#{sev} [#{progname}]: #{msg}\n" }
      end

      # Determine the file and line number of the caller.
      #
      # message - one backtrace entry (e.g. "path/file.rb:12:in `meth'"), or nil.
      #
      # Returns "file:line" with the base directory removed from the file
      # part, or nil when the entry does not look like a backtrace line.
      def parse_caller(message)
        located = /^(?<file>.+?):(?<line>\d+)(?::in `(?<method>.*)')?/.match(message)
        return unless located

        relative_file = located[:file].sub(instance.base_directory, "")
        "#{relative_file}:#{located[:line]}"
      end

      # Forwards any method the wrapped Logger responds to; everything else
      # falls through to the default NoMethodError behaviour.
      def method_missing(method, *args, &blk)
        return super unless valid_method?(method)

        # caller(1).first is the line that invoked the missing method on Log.
        instance.logger.progname = parse_caller(caller(1).first) if instance.debugging
        instance.logger.send(method, *args, &blk)
      end

      # Keeps respond_to? consistent with method_missing.
      def respond_to_missing?(method, include_all=false)
        valid_method?(method) || super
      end

      # True when the wrapped Logger responds to +method+.
      def valid_method?(method)
        instance.logger.respond_to?(method)
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
module Docuvator
  # Breaks a text file into paragraphs, sentences and a word-frequency table.
  #
  # The results are available through +paragraphs+, +sentences+ and
  # +word_frequency+ once #split has been called.
  class Splitter
    attr_accessor :paragraphs, :sentences, :word_frequency
    attr_reader :file

    # Sentence ignore set: abbreviations whose trailing period must not be
    # treated as the end of a sentence.
    IGNORE_SET = ['Mr', 'Mrs', 'Ms', 'Jr', 'Dr', 'Prof', 'Sr'].freeze

    REGEX_NEWLINE = /\n+/
    REGEX_SPACES = /\s+/
    REGEX_PUNCTUATION = /[.!?,:;\[\]\(\)]+/

    # Regex to split up sentences - http://stackoverflow.com/a/5844564/583592
    #
    #   (?<=              # Begin positive lookbehind.
    #     [.!?]           # Either an end of sentence punct,
    #   | [.!?][\'"]      # or end of sentence punct and quote.
    #   )                 # End positive lookbehind.
    #   (?<!              # Begin negative lookbehind.
    #     Mr\.            # Skip either "Mr."
    #   | Mrs\.           # or "Mrs.",
    #   | Ms\.            # or "Ms.",
    #   | Jr\.            # or "Jr.",
    #   | Dr\.            # or "Dr.",
    #   | Prof\.          # or "Prof.",
    #   | Sr\.            # or "Sr.",
    #   | \s[A-Z]\.       # or initials ex: "George W. Bush",
    #   )                 # End negative lookbehind.
    #   \s+               # Split on whitespace between sentences.
    #   /ix
    #
    # The abbreviation alternatives are generated from IGNORE_SET; the x flag
    # makes the spaces inside the pattern insignificant.
    REGEX_SENTENCES = /(?<=[.!?]|[.!?][\'"])(?<! #{IGNORE_SET.join('\\.| ') + '\\.|'} \s[A-Z]\.)\s+/ix

    # file - path of the text file to analyse.
    #
    # Logs an error and terminates the process with status 1 (raising
    # SystemExit) when the file does not exist.
    def initialize(file)
      unless File.exist?(file)
        Log.error "Cannot split #{file} as it does not exist"
        exit 1
      end

      @file = file
    end

    # Reads the file and (re)computes +paragraphs+, +sentences+ and
    # +word_frequency+ from scratch, so calling it repeatedly yields the same
    # result. Each step logs its count at INFO level.
    def split
      # Split up text into paragraphs. File.read closes the file once it has
      # been read; File.open(...).read left the handle open until GC.
      @paragraphs = File.read(@file).split(REGEX_NEWLINE).map do |paragraph|
        paragraph.gsub(REGEX_SPACES, ' ').strip
      end
      Log.info "Number of paragraphs: #{@paragraphs.size}"

      # Split up paragraphs into sentences
      @sentences = @paragraphs.flat_map { |paragraph| paragraph.split(REGEX_SENTENCES) }
      Log.info "Number of sentences: #{@sentences.size}"

      # Count words case-insensitively with punctuation removed
      @word_frequency = Hash.new(0)
      @sentences.each do |sentence|
        sentence.split.each do |word|
          @word_frequency[word.gsub(REGEX_PUNCTUATION, '').downcase] += 1
        end
      end
      Log.info "Unique words: #{@word_frequency.size}"
    end
  end
end
@@ -0,0 +1,3 @@
1
module Docuvator
  # Version string of the gem; Docuvator::CLI also hands it to OptionParser
  # as the program version.
  VERSION = "0.1.0"
end
@@ -0,0 +1,54 @@
1
+ require 'spec_helper'
2
+
3
include Docuvator

# Specs for the Log singleton facade.
describe Log do

  # A fresh anonymous subclass per example. Singleton gives each subclass its
  # own instance, so state changed by one example (such as use_debug) does
  # not carry over into the next one.
  let(:log) { Class.new(Log) }

  context "initializes instance" do
    it "should act as singleton" do
      # Identity check: both calls must return the very same object.
      log.instance.should equal log.instance
    end

    it "should have a logger" do
      log.instance.logger.should be_a Logger
    end

    it "should be a Log (FakeLog)" do
      # Comparing log.class with Log.class only compared Class with Class and
      # could never fail; check the inheritance relationship instead.
      log.superclass.should eq Log
    end

    it "should react to Logger methods" do
      Logger.public_instance_methods.each do |method|
        log.valid_method?(method).should be_true
      end
    end
  end

  context "#use_debug" do
    it "logger's progname before" do
      log.progname.should be_nil
    end

    it "logger's progname after" do
      # In debugging mode every forwarded call first records its call site as
      # the progname, so reading it back is no longer nil.
      log.use_debug
      log.progname.should_not be_nil
    end
  end

  context "#parse_caller" do
    context "with nothing" do
      it { log.parse_caller(nil).should be_nil }
    end

    context "with jumble (random text)" do
      it { log.parse_caller("asdaacsdc").should be_nil }
    end

    context "with valid caller" do
      it { log.parse_caller("docuvator/lib/docuvator.rb:45:in `respond_to_missing?'").should eq "docuvator/lib/docuvator.rb:45" }
    end
  end

end
@@ -0,0 +1,47 @@
1
+ require 'spec_helper'
2
+
3
include Docuvator

# Specs for Splitter: construction and the counts produced by #split.
describe Splitter do
  # Built lazily so that every context can supply its own +file+.
  let(:splitter) { Splitter.new(file) }

  context "initializes instance" do
    context "with valid file" do
      let(:file) { fixture('lorem_ipsum.txt') }

      it { splitter.file.should eq(file) }
    end

    context "with invalid file" do
      let(:file) { fixture('invalid.txt') }

      # The constructor calls exit for a missing file, which raises SystemExit.
      it { -> { splitter }.should raise_error(SystemExit) }
    end
  end

  describe "#split" do
    context "valid file" do
      let(:file) { fixture('lorem_ipsum.txt') }
      # Splitter that has already been split once.
      let(:splitter) { Splitter.new(file).tap { |s| s.split } }

      it { splitter.paragraphs.size.should eq(3) }
      it { splitter.sentences.size.should eq(39) }
      it { splitter.word_frequency.size.should eq(124) }
    end

    context "valid file (idempotence)" do
      let(:file) { fixture('lorem_ipsum.txt') }
      # Splitting twice must give the same counts as splitting once.
      let(:splitter) do
        Splitter.new(file).tap do |s|
          s.split
          s.split
        end
      end

      it { splitter.paragraphs.size.should eq(3) }
      it { splitter.sentences.size.should eq(39) }
      it { splitter.word_frequency.size.should eq(124) }
    end
  end
end
metadata ADDED
@@ -0,0 +1,56 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: docuvator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Kevin Jalbert
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-07-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Gem which extracts document statistics and information
15
+ email:
16
+ - kevin.j.jalbert@gmail.com
17
+ executables:
18
+ - docuvator
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/docuvator/initialize.rb
23
+ - lib/docuvator/log.rb
24
+ - lib/docuvator/splitter.rb
25
+ - lib/docuvator/version.rb
26
+ - lib/docuvator.rb
27
+ - spec/log_spec.rb
28
+ - spec/splitter_spec.rb
29
+ - bin/docuvator
30
+ homepage: https://github.com/kevinjalbert/docuvator
31
+ licenses: []
32
+ post_install_message:
33
+ rdoc_options: []
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ required_rubygems_version: !ruby/object:Gem::Requirement
43
+ none: false
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirements: []
49
+ rubyforge_project:
50
+ rubygems_version: 1.8.25
51
+ signing_key:
52
+ specification_version: 3
53
+ summary: Gem which extracts document statistics and information
54
+ test_files:
55
+ - spec/log_spec.rb
56
+ - spec/splitter_spec.rb