docuvator 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/docuvator +9 -0
- data/lib/docuvator.rb +44 -0
- data/lib/docuvator/initialize.rb +5 -0
- data/lib/docuvator/log.rb +59 -0
- data/lib/docuvator/splitter.rb +71 -0
- data/lib/docuvator/version.rb +3 -0
- data/spec/log_spec.rb +54 -0
- data/spec/splitter_spec.rb +47 -0
- metadata +56 -0
data/bin/docuvator
ADDED
data/lib/docuvator.rb
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
require 'docuvator/initialize'
|
|
2
|
+
|
|
3
|
+
module Docuvator
  # Command-line front end for the gem.
  #
  # Building an instance parses the process arguments and then adjusts the
  # shared Log's verbosity to match the switches that were given.
  class CLI
    # OpenStruct holding the boolean `verbose` and `debug` switches.
    attr_reader :options

    # Starts with both switches off, then lets the command line override them
    # and applies the resulting log level.
    def initialize
      @options = OpenStruct.new(verbose: false, debug: false)
      parse_options
      determine_log_level
    end

    # Logs the file being handled and runs a Splitter over it.
    #
    # file - path handed to Splitter.new.
    #
    # Returns whatever Splitter#split returns.
    def process(file)
      Log.info "Processing #{file}"
      splitter = Splitter.new(file)
      splitter.split
    end

    # Declares the supported switches and parses them with
    # OptionParser#parse! (no explicit argv, so the parser's default is used).
    #
    # Returns the value of OptionParser#parse!.
    def parse_options
      parser = OptionParser.new
      parser.version = VERSION
      parser.on("-v", "--verbose", "Verbose output (shows INFO level log statements)") { options.verbose = true }
      parser.on("-d", "--debug", "Debug output (shows DEBUG level log statements)") { options.debug = true }
      parser.parse!
    end

    private

    # Debug takes precedence over verbose; with neither switch the Log keeps
    # whatever level it already had.
    def determine_log_level
      if options.debug
        Log.level = Logger::DEBUG
        Log.use_debug
      else
        Log.level = Logger::INFO if options.verbose
      end
    end
  end
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
require 'logger'
|
|
2
|
+
require 'singleton'
|
|
3
|
+
|
|
4
|
+
module Docuvator
  # Process-wide logging facade.
  #
  # A single stdlib Logger writing to STDOUT is held by a Singleton instance.
  # Any class-level call that the Logger publicly responds to (Log.info,
  # Log.level=, Log.progname, ...) is forwarded to it via method_missing.
  class Log
    include Singleton

    # Leading "file:line" of a backtrace entry. Anything after the line
    # number (such as a trailing ":in ..." method suffix) is ignored because
    # only the file and line are reported.
    CALLER_PATTERN = /^(?<file>.+?):(?<line>\d+)/

    # logger         - the wrapped Logger.
    # base_directory - path prefix removed from caller locations in debug output.
    # debugging      - true once use_debug has been called.
    attr_accessor :logger, :base_directory, :debugging

    # Sets up an ERROR-level logger that prints only the message text.
    def initialize
      @base_directory = File.expand_path("../..", __FILE__) + "/"
      @debugging = false
      @logger = Logger.new(STDOUT)
      @logger.level = Logger::ERROR
      @logger.formatter = proc do |_sev, _datetime, _progname, msg|
        "#{msg}\n"
      end
    end

    # Switches to the debug output format ("SEVERITY [progname]: message") and
    # makes every forwarded call record its call site as the progname.
    #
    # Returns the newly installed formatter proc.
    def self.use_debug
      instance.debugging = true
      instance.logger.formatter = proc do |sev, _datetime, progname, msg|
        "#{sev} [#{progname}]: #{msg}\n"
      end
    end

    # Determine the file and line number of the caller.
    #
    # message - a backtrace entry such as "lib/foo.rb:12:in `bar'"; may be nil.
    #
    # Returns "file:line" with the first occurrence of base_directory removed
    # from the file part, or nil when message does not look like a backtrace
    # entry.
    def self.parse_caller(message)
      match = CALLER_PATTERN.match(message)
      return unless match

      "#{match[:file].sub(instance.base_directory, "")}:#{match[:line]}"
    end

    # Forwards Logger methods to the wrapped logger; everything else falls
    # through to the default NoMethodError behaviour.
    def self.method_missing(method, *args, &blk)
      return super unless valid_method?(method)

      # caller(1) is the frame that invoked Log.<method>, i.e. the real call site.
      instance.logger.progname = parse_caller(caller(1).first) if instance.debugging
      # valid_method? only admits publicly respondable methods, so public_send
      # is the matching dispatch.
      instance.logger.public_send(method, *args, &blk)
    end

    # Keeps respond_to? consistent with method_missing.
    def self.respond_to_missing?(method, include_all=false)
      valid_method?(method) || super
    end

    # True when the wrapped logger publicly responds to +method+.
    def self.valid_method?(method)
      instance.logger.respond_to? method
    end
  end
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
module Docuvator
  # Breaks a text file into paragraphs, sentences and a word-frequency table.
  #
  # After #split has run:
  #   paragraphs     - Array of Strings, one per newline-separated chunk, with
  #                    runs of whitespace collapsed to single spaces.
  #   sentences      - Array of Strings produced by splitting each paragraph
  #                    with REGEX_SENTENCES.
  #   word_frequency - Hash mapping each lower-cased, punctuation-stripped word
  #                    to its number of occurrences (missing keys read as 0).
  class Splitter
    attr_accessor :paragraphs, :sentences, :word_frequency
    attr_reader :file

    # Sentence ignore set: abbreviations whose trailing period must not end a
    # sentence.
    IGNORE_SET = ['Mr', 'Mrs', 'Ms', 'Jr', 'Dr', 'Prof', 'Sr'].freeze

    REGEX_NEWLINE = /\n+/
    REGEX_SPACES = /\s+/
    REGEX_PUNCTUATION = /[.!?,:;\[\]\(\)]+/

    # Regex to split up sentences - http://stackoverflow.com/a/5844564/583592
    #
    #   (?<=                 Begin positive lookbehind.
    #     [.!?]              Either an end of sentence punct,
    #     | [.!?]['"]        or end of sentence punct and quote.
    #   )                    End positive lookbehind.
    #   (?<!                 Begin negative lookbehind.
    #     Mr\. | Mrs\. | ... Skip every abbreviation in IGNORE_SET,
    #     | \s[A-Z]\.        or initials ex: "George W. Bush".
    #   )                    End negative lookbehind.
    #   \s+                  Split on whitespace between sentences.
    #
    # The x flag makes the literal spaces below insignificant; the i flag makes
    # the abbreviations and the initial match case-insensitively.
    REGEX_SENTENCES = /(?<=[.!?]|[.!?][\'"])(?<! #{IGNORE_SET.join('\\.| ') + '\\.|'} \s[A-Z]\.)\s+/ix

    # file - path of the text file to analyse.
    #
    # Logs an error and terminates the process with status 1 (SystemExit) when
    # the file does not exist.
    def initialize(file)
      if File.exist? file
        @file = file
      else
        Log.error "Cannot split #{file} as it does not exist"
        exit 1
      end
    end

    # Reads the file and (re)builds paragraphs, sentences and word_frequency
    # from scratch, logging the size of each at INFO level.
    #
    # Returns the result of the final Log.info call.
    def split
      # Split up text into paragraphs. File.read closes the handle itself,
      # unlike File.open(...).read which leaves it open until garbage collection.
      @paragraphs = File.read(@file).split(REGEX_NEWLINE).map do |paragraph|
        paragraph.gsub(REGEX_SPACES, ' ').strip
      end
      Log.info "Number of paragraphs: #{@paragraphs.size}"

      # Split up paragraphs into sentences
      @sentences = @paragraphs.flat_map { |paragraph| paragraph.split(REGEX_SENTENCES) }
      Log.info "Number of sentences: #{@sentences.size}"

      # Count words after stripping punctuation and normalising case
      @word_frequency = Hash.new(0)
      @sentences.each do |sentence|
        sentence.split.each do |word|
          @word_frequency[word.gsub(REGEX_PUNCTUATION, '').downcase] += 1
        end
      end
      Log.info "Unique words: #{@word_frequency.size}"
    end
  end
end
|
data/spec/log_spec.rb
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
include Docuvator
|
|
4
|
+
|
|
5
|
+
# Specs for Docuvator::Log. Each example works on a fresh anonymous subclass
# so that the Singleton instance (and any state set by use_debug) is not
# shared between examples.
describe Log do

  let(:log) { Class.new(Log) }

  context "initializes instance" do
    it "should acts as singleton" do
      log.instance.should == log.instance
    end

    it "should have a logger" do
      log.instance.logger.should be_a Logger
    end

    it "should be a Log (FakeLog)" do
      # Comparing log.class with Log.class only compares Class with Class and
      # can never fail; check the actual inheritance instead.
      log.superclass.should eq Log
    end

    it "should react to Logger methods" do
      Logger.public_instance_methods.each do |method|
        log.valid_method?(method).should be_true
      end
    end
  end

  context "#use_debug" do
    it "logger's progname before" do
      log.progname.should be_nil
    end

    it "logger's progname after" do
      # Once debugging is on, every forwarded call stamps the call site as progname.
      log.use_debug
      log.progname.should_not be_nil
    end
  end

  context "#parse_caller" do
    context "with nothing" do
      it { log.parse_caller(nil).should be_nil }
    end

    context "with jumble (random text)" do
      it { log.parse_caller("asdaacsdc").should be_nil }
    end

    context "with valid caller" do
      it { log.parse_caller("docuvator/lib/docuvator.rb:45:in `respond_to_missing?'").should eq "docuvator/lib/docuvator.rb:45" }
    end
  end

end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
include Docuvator
|
|
4
|
+
|
|
5
|
+
# Specs for Docuvator::Splitter, driven by text fixtures resolved through the
# spec helper's `fixture` method.
describe Splitter do
  let(:splitter) { Splitter.new(file) }

  context "initializes instance" do
    context "with valid file" do
      let(:file) { fixture('lorem_ipsum.txt') }

      it { splitter.file.should eq file }
    end

    context "with invalid file" do
      let(:file) { fixture('invalid.txt') }

      # A missing file makes the constructor call exit, which surfaces as SystemExit.
      it { proc { splitter }.should raise_error SystemExit }
    end
  end

  describe "#split" do
    context "valid file" do
      let(:file) { fixture('lorem_ipsum.txt') }
      let(:splitter) { Splitter.new(file).tap { |instance| instance.split } }

      it { splitter.paragraphs.size.should eq 3 }
      it { splitter.sentences.size.should eq 39 }
      it { splitter.word_frequency.size.should eq 124 }
    end

    context "valid file (idempotence)" do
      let(:file) { fixture('lorem_ipsum.txt') }
      # Splitting twice must give the same counts as splitting once.
      let(:splitter) { Splitter.new(file).tap { |instance| 2.times { instance.split } } }

      it { splitter.paragraphs.size.should eq 3 }
      it { splitter.sentences.size.should eq 39 }
      it { splitter.word_frequency.size.should eq 124 }
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: docuvator
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
prerelease:
|
|
6
|
+
platform: ruby
|
|
7
|
+
authors:
|
|
8
|
+
- Kevin Jalbert
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
date: 2013-07-13 00:00:00.000000000 Z
|
|
13
|
+
dependencies: []
|
|
14
|
+
description: Gem which extracts document statistics and information
|
|
15
|
+
email:
|
|
16
|
+
- kevin.j.jalbert@gmail.com
|
|
17
|
+
executables:
|
|
18
|
+
- docuvator
|
|
19
|
+
extensions: []
|
|
20
|
+
extra_rdoc_files: []
|
|
21
|
+
files:
|
|
22
|
+
- lib/docuvator/initialize.rb
|
|
23
|
+
- lib/docuvator/log.rb
|
|
24
|
+
- lib/docuvator/splitter.rb
|
|
25
|
+
- lib/docuvator/version.rb
|
|
26
|
+
- lib/docuvator.rb
|
|
27
|
+
- spec/log_spec.rb
|
|
28
|
+
- spec/splitter_spec.rb
|
|
29
|
+
- bin/docuvator
|
|
30
|
+
homepage: https://github.com/kevinjalbert/docuvator
|
|
31
|
+
licenses: []
|
|
32
|
+
post_install_message:
|
|
33
|
+
rdoc_options: []
|
|
34
|
+
require_paths:
|
|
35
|
+
- lib
|
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
37
|
+
none: false
|
|
38
|
+
requirements:
|
|
39
|
+
- - ! '>='
|
|
40
|
+
- !ruby/object:Gem::Version
|
|
41
|
+
version: '0'
|
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
43
|
+
none: false
|
|
44
|
+
requirements:
|
|
45
|
+
- - ! '>='
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
requirements: []
|
|
49
|
+
rubyforge_project:
|
|
50
|
+
rubygems_version: 1.8.25
|
|
51
|
+
signing_key:
|
|
52
|
+
specification_version: 3
|
|
53
|
+
summary: Gem which extracts document statistics and information
|
|
54
|
+
test_files:
|
|
55
|
+
- spec/log_spec.rb
|
|
56
|
+
- spec/splitter_spec.rb
|