rstt 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Binary file
@@ -0,0 +1,8 @@
1
+ .rvmrc
2
+ spec/rstt_rake_spec.rb
3
+
4
+ lib/rstt/tt_settings.rb
5
+
6
+ tmp/temp.html.haml
7
+ todos.task
8
+ lib/rstt/tt_settings.rb
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in rstt.gemspec
4
+ gemspec
5
+
6
+ group :development, :test do
7
+ gem "rspec"
8
+ gem "yard"
9
+ gem "syntax"
10
+ end
11
+
12
+ gem "slop"
13
+ gem "thor"
14
+ gem "activesupport"
15
+ gem "celluloid"
@@ -0,0 +1,44 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rstt (0.0.1)
5
+ activesupport
6
+ celluloid
7
+ slop
8
+ thor
9
+
10
+ GEM
11
+ remote: http://rubygems.org/
12
+ specs:
13
+ activesupport (3.2.1)
14
+ i18n (~> 0.6)
15
+ multi_json (~> 1.0)
16
+ celluloid (0.8.0)
17
+ diff-lcs (1.1.3)
18
+ i18n (0.6.0)
19
+ multi_json (1.0.4)
20
+ rspec (2.8.0)
21
+ rspec-core (~> 2.8.0)
22
+ rspec-expectations (~> 2.8.0)
23
+ rspec-mocks (~> 2.8.0)
24
+ rspec-core (2.8.0)
25
+ rspec-expectations (2.8.0)
26
+ diff-lcs (~> 1.1.2)
27
+ rspec-mocks (2.8.0)
28
+ slop (3.0.4)
29
+ syntax (1.0.0)
30
+ thor (0.14.6)
31
+ yard (0.7.4)
32
+
33
+ PLATFORMS
34
+ ruby
35
+
36
+ DEPENDENCIES
37
+ activesupport
38
+ celluloid
39
+ rspec
40
+ rstt!
41
+ slop
42
+ syntax
43
+ thor
44
+ yard
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 LeFnord
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,55 @@
1
+ TreeTagger for Ruby
2
+ ===================
3
+
4
+ DESCRIPTION
5
+ -----------
6
+
7
+ The Ruby based wrapper for the TreeTagger by Helmut Schmid.
8
+ Check it out if you are interested
9
+ in Natural Language Processing (NLP) and Human Language Technology (HLT).
10
+
11
+ INSTALLATION + REQUIREMENTS
12
+ ---------------------------
13
+
14
+ Before you install the treetagger-ruby package, please ensure you have downloaded and installed the [TreeTagger](http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/) itself.
15
+ (And please respect its license terms.)
16
+
17
+ gem install rstt
18
+ thor config:init /path/to/your/TreeTagger/
19
+
20
+ or:
21
+
22
+ rstt -i /path/to/your/TreeTagger/
23
+
24
+ USAGE
25
+ -----
26
+
27
+ 1. You have some `class`, where you want to use Rstt ..
28
+ the input is given by: `Rstt.set_input lang: lang, content: content` with default language '`en`';
29
+ the supported languages can be listed with `Rstt.language_codes`;
30
+ installed languages are stored in `Rstt::LANGUAGES` (cause it could be different)
31
+
32
+ class Foo
33
+ include Rstt
34
+
35
+ def pos_tagging(lang,content)
36
+ Rstt.set_input lang: lang, content: content
37
+ Rstt.preprocessing
38
+ Rstt.tagging
39
+ processed_data = Rstt.tagged
40
+ end
41
+ end
42
+
43
+ that's all, the processed data are accessible via `Rstt.tagged`,
44
+ it is an Array in which each element is itself an Array with the following elements
45
+
46
+ 1. the input word itself
47
+ 2. the word class
48
+ 3. the lemma of the input; depends on your input language
49
+
50
+ 2. or via CLI, check usage with `rstt -h`
51
+
52
+ LICENSE
53
+ -------
54
+
55
+ see License.txt
@@ -0,0 +1,14 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core'
4
+ require 'rspec/core/rake_task'
5
+ RSpec::Core::RakeTask.new(:spec) do |spec|
6
+ spec.pattern = FileList['spec/**/*_spec.rb']
7
+ end
8
+
9
+ task :default => :spec
10
+
11
+ require 'yard'
12
+ YARD::Rake::YardocTask.new
13
+
14
+ Dir["lib/tasks/**/*.rake"].sort.each { |ext| load ext }
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "rstt"
4
+ require "slop"
5
+ require "awesome_print"
6
+
7
+ # ToDo: add task for initializing ✓
8
+ # add task to show installed languages ✓
9
+ # add preprocessing task ✓
10
+ # add task for running from terminal ✓
11
+
12
+ opts = Slop.parse do
13
+ banner "Usage:
14
+ rstt [-c /path/to/TreeTagger] | [{-t|-p|-t -p} -l lang {-i 'input text'|-f /input/file}] | [-l]\n
15
+ tagging:
16
+ rstt -t -l <language> -i <\"input text\">
17
+ rstt -t -l <language> -f <path/to/file>
18
+ preprocessing:
19
+ rstt -p -l <language> -i <\"input text\">
20
+ rstt -p -l <language> -f <path/to/file>
21
+ both:
22
+ rstt -t -p -l <language> -i <\"input text\">
23
+ rstt -t -p -l <language> -f <path/to/file>\n"
24
+
25
+ on :h, :help, "print help message" do
26
+ puts help
27
+ end
28
+
29
+ on :c, :configure=, "configure Rstt by setting TreeTagger path and installed languages [ARG path]", optional: false do
30
+ print "initilizing ..\n"
31
+ print `thor config:init #{self[:configure]}`
32
+ end
33
+
34
+
35
+ on :t, :tag, "run tagger", argument: false
36
+ on :p, :preprocess, "clean up html sites, strip non word characters", argument: false
37
+ on :l, :lang=, "language [with ARG input|without ARG show]",argument: :optional, optional: true
38
+ on :f, "input file", optional: true, argument: true
39
+ on :i, "input text", optional: true, argument: true
40
+ end
41
+
42
+ # show installed languages
43
+ if opts.l? && opts[:lang].nil? && !(opts.t? || opts.p?)
44
+ puts "installed languages\n(use languages codes on the left)\n"
45
+ Rstt.installed_language_codes.each do |lang|
46
+ puts lang.first.to_s + ": " + lang.last
47
+ end
48
+ end
49
+
50
+ # input data
51
+ # get input from file
52
+ if opts.f? && !opts[:f].nil?
53
+ @input = ""
54
+ input_file = opts[:f]
55
+ # test if file exists
56
+ if File.exists?(input_file)
57
+ File.open(input_file).each do |line|
58
+ @input += line.strip + " "
59
+ end
60
+ else
61
+ puts "not an regular file"
62
+ end
63
+
64
+ # get STDIN
65
+ elsif opts.i? && !opts[:i].nil?
66
+ @input = opts[:i]
67
+ elsif opts.p? || opts.t?
68
+ puts "you must specify some input"
69
+ end
70
+
71
+ # tagging stage
72
+ if opts.t?
73
+ p @input.length
74
+
75
+ if opts.l? && !opts[:lang].nil?
76
+ lang = opts[:lang]
77
+ Rstt.set_input lang: lang, content: @input
78
+ else
79
+ Rstt.set_input content: @input
80
+ end
81
+
82
+ if opts.p?
83
+ Rstt.preprocessing
84
+ end
85
+
86
+ Rstt.tagging
87
+ puts "input .."
88
+ puts Rstt.content
89
+ puts "\n"
90
+ Rstt.tagged.each do |tag|
91
+ p tag
92
+ end
93
+ end
@@ -0,0 +1,109 @@
1
+ # coding: utf-8
2
+ require "celluloid"
3
+ require "active_support/all"
4
+
5
+ # own dependencies
6
+ require "rstt/version"
7
+ require "rstt/preprocess"
8
+ require "rstt/tt_settings"
9
+
10
+ module Rstt
11
+ # added celluloid for concurrency
12
+ include Celluloid
13
+ mattr_accessor :lang, :content, :origin, :tagged, :sentences, :tags
14
+
15
+ def self.set_input(input = {lang: "", content: ""})
16
+ if input[:lang]
17
+ @@lang = input[:lang]
18
+ else
19
+ @@lang = "en"
20
+ end
21
+ @@content = input[:content]
22
+ end
23
+
24
+ # tagging stage related methods
25
+ def self.tagging
26
+ bar = `echo #{self.content} | #{TT_HOME}/cmd/#{build_tagging_command}`
27
+ # @@tagged = bar.split("\n").collect{|word| word.split("\t") }
28
+ @@tagged = bar.split("\n").collect do |word|
29
+ metrik = word.split("\t")
30
+ # use singular attribute names
31
+ {word: metrik[0], tag: metrik[1], stem: metrik[2]}
32
+ end
33
+ end
34
+
35
+ def self.get_sentences
36
+ @@sentences = Preprocess.split_sentences(self.content)
37
+ end
38
+
39
+ def self.build_tagging_command
40
+ lang = get_command_language
41
+ if LANGUAGES[lang][:utf8]
42
+ cmd = "tree-tagger-#{lang}-utf8"
43
+ else
44
+ cmd = "tree-tagger-#{lang}"
45
+ end
46
+
47
+ cmd
48
+ end
49
+
50
+ def self.get_command_language
51
+ lang = language_codes[self.lang.to_sym]
52
+
53
+ if lang.nil?
54
+ raise "language not supported"
55
+ elsif LANGUAGES[lang].nil?
56
+ raise "language supported, but not installed"
57
+ end
58
+
59
+ lang
60
+ end
61
+
62
+ def self.preprocessing
63
+ @@origin = @@content
64
+ # it is important that HTML tags are stripped first, and non-word characters only afterwards
65
+ Preprocess.strip_html_tags(self.content)
66
+ Preprocess.strip_punctation_and_non_word_caracters(self.content)
67
+ end
68
+
69
+ def self.language_codes
70
+ { bg: "bulgarian",
71
+ nl: "dutch",
72
+ en: "english",
73
+ et: "estonian",
74
+ fr: "french",
75
+ de: "german",
76
+ el: "greek",
77
+ it: "italian",
78
+ la: "latin",
79
+ ru: "russian",
80
+ es: "spanish",
81
+ sw: "swahili"
82
+ }
83
+ end
84
+
85
+ def self.installed_language_codes
86
+ installed = {}
87
+ language_codes.each do |lang|
88
+ installed[lang.first] = lang.last unless LANGUAGES[lang.last].nil?
89
+ end
90
+
91
+ installed
92
+ end
93
+
94
+ # output and processing helpers
95
+ def self.print
96
+ p @@lang
97
+ p @@content
98
+ end
99
+
100
+ # ToDo 2012-02-25: work with `method_missing?`: DRY
101
+ # methods are plural @@tagged keys
102
+ %w(words tags stems).each do |meth|
103
+ self.define_singleton_method(meth) do
104
+ foo = []
105
+ @@tagged.each{ |tag| foo << tag[meth.singularize.to_sym] }
106
+ foo
107
+ end
108
+ end
109
+ end
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ module Rstt
3
+ module Preprocess
4
+ def self.strip_html_tags(content)
5
+ unless content.nil?
6
+ content.gsub!(/\s*<\/?[^>]*>\s*/," ")
7
+ content.gsub!(/&\w+;/i," ")
8
+ end
9
+ end
10
+
11
+ def self.strip_punctation_and_non_word_caracters(content)
12
+ unless content.nil?
13
+ content.gsub!(/\b[\#\%\^\$\@\(\)✗✓=\/"']{1,2}\b/i,'')
14
+ content.gsub!(/\s*[\#\%\^\$\@\(\)✗✓=\/"']{1,2}\s*/i,' ')
15
+ content.gsub!(/(–|--)/,'-')
16
+ content.gsub!(/\s+/,' ')
17
+ content.strip!
18
+ # content.gsub!(/(–|--)/,'-')
19
+ # content.gsub!(/\s+/,' ')
20
+ end
21
+ end
22
+ end # module Preprocess
23
+ end # module Rstt
24
+
25
+
@@ -0,0 +1,3 @@
1
+ module Rstt
2
+ VERSION = "0.9.0"
3
+ end
@@ -0,0 +1,138 @@
1
+ class Config < Thor
2
+ include Thor::Actions
3
+
4
+ desc "init PATH", "add TreeTagger path to module (--force replace old one)"
5
+ method_options :force => :boolean
6
+ def init(path)
7
+ if options.force?
8
+ remove
9
+ end
10
+ case
11
+ # 1. check if path set
12
+ when path
13
+ # a) check, if it is an accessible dir
14
+ if ::File.exists?(path)
15
+ insert(path)
16
+ set_languages
17
+ else
18
+ puts "not an regular directory"
19
+ end
20
+ # 2. is env set?
21
+ when ENV['TREETAGGERHOME']
22
+ path = ENV['TREETAGGERHOME']
23
+ insert(path)
24
+ set_languages
25
+ else
26
+ puts "specify path as argument:\n"
27
+ puts " thor config:init /path/to/your/TreeTagger/installation/"
28
+ end
29
+
30
+ end
31
+
32
+ desc "set_languages", "set installed languages"
33
+ def set_languages
34
+ puts "setting languages .."
35
+ languages = {}
36
+
37
+ lib_path = File.join(get_path, 'lib')
38
+ Dir.entries(lib_path).each do |file|
39
+ foos = file.split("-")
40
+ languages[foos.first] = {utf8: foos.last.include?("utf")} unless foos.first.include?(".")
41
+ end
42
+
43
+ insert_into_file "lib/rstt/tt_settings.rb", after: /^ TT_HOME = (.+)\n/ do
44
+ " LANGUAGES = #{languages}\n"
45
+ end
46
+
47
+ end
48
+
49
+ desc "get_languages", "see possible languages"
50
+ def get_languages
51
+ languages = []
52
+ get_files_of_dir do |file|
53
+ languages << file.split("-")[2] if file.include?("tree-tagger")
54
+ end
55
+
56
+ languages.uniq
57
+ end
58
+
59
+ # desc "get_workflows", "get installed workflows"
60
+ # def get_workflows
61
+ # puts "get workflow commands from command files"
62
+ # tree_tagger_scripts = []
63
+ # get_files_of_dir do |file|
64
+ # tree_tagger_scripts << file
65
+ # end
66
+ #
67
+ # tree_tagger_scripts.each do |file|
68
+ # p "_____________"
69
+ # p file
70
+ # File.open(file).readlines.each do |line|
71
+ # p line
72
+ # end
73
+ # p "\n"
74
+ # end
75
+ # end
76
+
77
+ private
78
+ def insert(path)
79
+ puts "insert path .."
80
+ unless File.exists?("lib/rstt/tt_settings.rb")
81
+ create_file "lib/rstt/tt_settings.rb" do
82
+ "module Rstt
83
+ TT_HOME = \"#{path}\"
84
+ end"
85
+ end
86
+ else
87
+ insert_into_file "lib/rstt/tt_settings.rb", after: "module Rstt\n" do
88
+ " TT_HOME = \"#{path}\"\n"
89
+ end
90
+ end
91
+
92
+ insert_require
93
+ end
94
+
95
+ def remove
96
+ puts "removing old .."
97
+ gsub_file "lib/rstt/tt_settings.rb", /^ TT_HOME = (.+)\n/ do |match|
98
+ match = ""
99
+ end
100
+
101
+ remove_require
102
+ end
103
+
104
+ def get_path
105
+ path = ""
106
+ gsub_file "lib/rstt/tt_settings.rb", /^ TT_HOME = (.+)\n/ do |match|
107
+ path = match
108
+ end
109
+ path.chomp.split(" = ").last.gsub("\"",'')
110
+ end
111
+
112
+ def get_files_of_dir(dir = "cmd", pattern = "tree-tagger*")
113
+ cmd_path = File.join(get_path, dir, pattern)
114
+ Dir.glob(cmd_path).each do |file|
115
+ yield file
116
+ end
117
+ # Dir.glob(tt_files).each do |file|
118
+ # p file
119
+ # yield file
120
+ # end
121
+
122
+ # Dir.entries(cmd_path).each do |file|
123
+ # yield file
124
+ # end
125
+ end
126
+
127
+ def insert_require
128
+ insert_into_file "lib/rstt.rb", after: "require \"rstt/preprocess\"\n" do
129
+ "require \"rstt/tt_settings\"\n"
130
+ end
131
+ end
132
+
133
+ def remove_require
134
+ gsub_file "lib/rstt.rb", "require \"rstt/tt_settings\"" do |match|
135
+ match = ""
136
+ end
137
+ end
138
+ end
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "rstt/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "rstt"
7
+ s.version = Rstt::VERSION
8
+ s.authors = ["LeFnord"]
9
+ s.email = ["pscholz.le@gmail.com"]
10
+ s.homepage = ""
11
+ s.summary = %q{another ruby wrapper for Stuttgarter Tree Tagger}
12
+ s.description = %q{another ruby wrapper for Stuttgarter Tree Tagger}
13
+
14
+ s.rubyforge_project = "rstt"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib","bin"]
20
+
21
+ # specify any dependencies here; for example:
22
+ s.add_development_dependency "rspec"
23
+ s.add_development_dependency "yard"
24
+ s.add_development_dependency "syntax"
25
+ s.add_runtime_dependency "thor"
26
+ s.add_runtime_dependency "activesupport"
27
+ s.add_runtime_dependency "celluloid"
28
+ s.add_runtime_dependency "slop"
29
+ end
@@ -0,0 +1,57 @@
1
+ # coding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe Rstt do
5
+ describe "Preprocessing" do
6
+ before(:each) do
7
+ @control_1 = "Am 13.04.1899 ist es geschehen - was steht woanders. Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa."
8
+ @string = "Am 13.04.1899 ist es geschehen – was steht woanders. Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa."
9
+ @control_2 = "Am 13.04.1899 ist es geschehen - was steht woanders. Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Null-am e-nim leo, egestas id, condimentum at, laoreet mattis, massa."
10
+ @extra = "Am 13.04.1899 ist es geschehen – was steht woanders. Lo#rem ipsum d#olor sit am%et, cons^ect$etuer adipiscing e@lit. Nam cursus. Morbi ut mi. Null-am e--nim leo, egestas id, condimentum at, laoreet mattis, massa."
11
+ @control_3 = "Am 13.04.1899 ist es geschehen - was steht woanders. Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Null-am e-nim leo, egestas id, condimentum at, laoreet mattis, massa."
12
+ @getaggt = "Am 13.04.1899 ist es geschehen – was steht woanders. Lo#rem ipsum d#olor sit am%et, cons^ect$etuer adipiscing e@lit. Nam cursus. Morbi ut mi. Null-am e--nim leo, egestas id, condimentum at, laoreet mattis, massa."
13
+
14
+ @real = "Title editierbar machen ✗ mmh, ist wohl besser, wenn es so bleibt, beim ersten anlegen wird ein Titel gebastelt und gut ist, so bleibt auch der slug konsistent deployen auf Arielle, evtl. über einen BitBucket Account (als Repository) ✓"
15
+ @control_r = "Title editierbar machen mmh, ist wohl besser, wenn es so bleibt, beim ersten anlegen wird ein Titel gebastelt und gut ist, so bleibt auch der slug konsistent deployen auf Arielle, evtl. über einen BitBucket Account als Repository"
16
+
17
+ @real_2 = "<h3>ToDos: Think</h3><div><ul><li>Title editierbar machen&nbsp;✗</li><ul><li>mmh, ist wohl besser, wenn es so bleibt, beim ersten anlegen wird ein Titel gebastelt und gut ist, so bleibt auch der slug konsistent</li></ul><li>deployen auf Arielle, evtl. über einen BitBucket Account (als Repository)&nbsp;✓</li><li>Abbildungen und andere Objekte als Attachement ermöglichen</li><li>Tabellen erstellen (evtl. Berechnungen?)</li><ul><li>dafür wird wohl ein anderer Editor nötig sein, folgende kämen dafür in Frage:</li><ul><li><a href='http://jejacks0n.github.com/mercury/' title='MercuryEdit' target='_blank'>MercuryEdit</a><br></li><li><a href='http://www.aloha-editor.org/' title='AlohaEditor' target='_blank'>AlohaEditor</a><br></li><li><a href='http://ckeditor.com/' title='CKEditor' target='_blank'>CKEditor</a><br></li></ul><li>wahrscheinlich kommt der&nbsp;<a href='http://www.aloha-editor.org/' title='AlohaEditor' target='_blank' style='margin-top: 0px; margin-right: 0px; margin-bottom: 0px; margin-left: 0px; padding-top: 2px; padding-right: 3px; padding-bottom: 2px; padding-left: 3px; border-top-width: 0px; border-right-width: 0px; border-bottom-width: 0px; border-left-width: 0px; border-style: initial; border-color: initial; font-size: 13px; font: inherit; vertical-align: baseline; border-style: initial; border-color: initial; border-style: initial; border-color: initial; border-style: initial; border-color: initial; border-style: initial; border-color: initial; border-style: initial; border-color: initial; color: rgb(153, 153, 153); text-decoration: none; '>AlohaEditor</a>&nbsp;in Frage ( insbesondere:&nbsp;<a href='http://www.aloha-editor.org/demos/aloha-world-example/' title='AlohaEditor|World' target='_blank'>AlohaEditor|World</a>)</li></ul><li>Referenzierung unter den Notes ermöglichen</li></ul></div>"
18
+ end
19
+
20
+ it "should replace a string by itself" do
21
+ Rstt.set_input lang: "en", content: @string
22
+ Rstt.preprocessing
23
+ Rstt.content.should == @control_1
24
+ end
25
+
26
+ it "should replace non alphanum caharacters within words with nothings" do
27
+ Rstt.set_input lang: "en", content: @extra
28
+ Rstt.preprocessing
29
+ Rstt.content.should == @control_2
30
+ end
31
+
32
+ it "should delete non letter characters" do
33
+ Rstt.set_input lang: "de", content: @real
34
+ Rstt.preprocessing
35
+ Rstt.content.should == @control_r
36
+ end
37
+
38
+ it "should clean up input from html and non word characters" do
39
+ Rstt.set_input lang: "de", content: @real_2
40
+ Rstt.preprocessing
41
+ end
42
+
43
+ describe "preprocessing:html" do
44
+ before(:each) do
45
+ path = File.join(File.dirname(__FILE__), '..', 'tmp','tmp.html')
46
+ @html_file = File.open(path)
47
+ end
48
+
49
+ it "should load the given html file" do
50
+ file = @html_file.read
51
+ Rstt.set_input lang: "en", content: file
52
+ Rstt.preprocessing
53
+ Rstt.content.should_not match(/<\/?[^>]*>/)
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,117 @@
1
+ # coding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe Rstt do
5
+ describe "module variables" do
6
+ describe "responds to .." do
7
+ it "should respond to :lang" do
8
+ Rstt.respond_to?(:lang).should be_true
9
+ end
10
+
11
+ it "should respond to :content" do
12
+ Rstt.respond_to?(:content).should be_true
13
+ end
14
+
15
+ it "should respond to :origin" do
16
+ Rstt.respond_to?(:origin).should be_true
17
+ end
18
+
19
+ it "should respond to :tagged" do
20
+ Rstt.respond_to?(:tagged).should be_true
21
+ end
22
+ end # responds to ..
23
+
24
+ describe "getting" do
25
+ it "should print both" do
26
+ Rstt.set_input lang: "de", content: "Das ist ein einfacher Dummy Satz."
27
+ Rstt.print.should be_true
28
+ end
29
+ end
30
+
31
+ describe "setting" do
32
+ before(:each) do
33
+ @input = {lang: "de", content: "Das ist ein einfacher Dummy Satz."}
34
+ end
35
+
36
+ it "should set both by :set_input " do
37
+ Rstt.set_input @input
38
+ Rstt.lang.should == @input[:lang]
39
+ Rstt.content.should == @input[:content]
40
+ end
41
+ end # setting
42
+ end # module variables
43
+
44
+ describe "tagging stages" do
45
+ before(:each) do
46
+ @input = {lang: "de", content: "Das ist ein einfacher Dummy Satz."}
47
+ end
48
+
49
+ it "should pos tagging on given input data" do
50
+ Rstt.set_input @input
51
+ Rstt.tagging
52
+ end
53
+
54
+ it "origin should be content after preprocessing" do
55
+ Rstt.set_input @input
56
+ Rstt.preprocessing
57
+ Rstt.origin.should == @input[:content]
58
+ end
59
+
60
+ describe "language finding and command building" do
61
+ it "should find the right language dependent on input" do
62
+ Rstt.set_input lang: "en", content: ""
63
+ Rstt.get_command_language.should == ("english")
64
+ end
65
+
66
+ it "should raise an exception if language not supported" do
67
+ Rstt.set_input lang: "xy", content: ""
68
+ expect {Rstt.get_command_language}.to raise_error
69
+ end
70
+
71
+ it "should raise an exception if language not installed" do
72
+ Rstt.set_input lang: "ru", content: ""
73
+ expect {Rstt.get_command_language}.to raise_error
74
+
75
+ end
76
+
77
+ it "should build the correct tagging command, dependend on input language" do
78
+ Rstt.set_input lang: "en", content: ""
79
+ Rstt.build_tagging_command.should == ("tree-tagger-english")
80
+ end
81
+
82
+ it "should prefer utf8 over other" do
83
+ Rstt.set_input lang: "de", content: ""
84
+ Rstt.build_tagging_command.should == ("tree-tagger-german-utf8")
85
+ end
86
+ end # language finding and command building
87
+
88
+ describe "output format" do
89
+ before(:each) do
90
+ @input = {lang: "de", content: "Das ist ein einfacher Dummy Satz"}
91
+ end
92
+ it "should be an array as output" do
93
+ Rstt.set_input @input
94
+ Rstt.preprocessing
95
+ Rstt.tagging.should be_a Array
96
+ end
97
+ it "should have correct count" do
98
+ Rstt.set_input @input
99
+ verfifier = @input[:content].split(" ").length
100
+ Rstt.preprocessing
101
+ Rstt.tagging
102
+ Rstt.tagged.should have(verfifier).things
103
+ end
104
+
105
+ # testing output format, means: for each word in given input gives an array
106
+ # with: 1. input word, 2. tagging category, 3. lemma
107
+ # could be done over output length
108
+ it "should have correct output format" do
109
+ Rstt.set_input @input
110
+ Rstt.preprocessing
111
+ Rstt::LANGUAGES
112
+ Rstt.tagging.first.should have(3).things
113
+ end
114
+ end # output format
115
+
116
+ end # tagging stages
117
+ end
@@ -0,0 +1,17 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper.rb"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+
8
+ require "rstt"
9
+
10
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
11
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
12
+
13
+ RSpec.configure do |config|
14
+ config.treat_symbols_as_metadata_keys_with_true_values = true
15
+ config.run_all_when_everything_filtered = true
16
+ config.filter_run :focus
17
+ end
@@ -0,0 +1,146 @@
1
+ <h2>und hier kommt dummy text</h2>
2
+ <p>
3
+ <mark data-related='hamburg' class='glossary'>Hamburg</mark>
4
+ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa.
5
+ Sed eleifend nonummy diam. Praesent mauris ante, elementum et, bibendum at, posuere sit amet, nibh. Duis tincidunt lectus quis dui viverra vestibulum.
6
+ Suspendisse vulputate aliquam dui. Nulla elementum dui ut augue. Aliquam vehicula mi at mauris. Maecenas placerat, nisl at consequat rhoncus, sem nunc
7
+ gravida justo, quis eleifend arcu velit quis lacus. Morbi magna magna, tincidunt a, mattis non, imperdiet vitae, tellus. Sed odio est, auctor ac,
8
+ sollicitudin in, consequat vitae, orci. Fusce id felis. Vivamus sollicitudin metus eget eros.Pellentesque habitant morbi tristique senectus et netus et
9
+ malesuada fames ac turpis egestas. In posuere felis nec tortor. Pellentesque faucibus. Ut accumsan ultricies elit. Maecenas at justo id velit placerat
10
+ molestie. Donec dictum lectus non odio. Cras a ante vitae enim iaculis aliquam. Mauris nunc quam, venenatis nec, euismod sit amet, egestas placerat, est.
11
+ Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Cras id elit. Integer quis urna. Ut ante enim, dapibus malesuada,
12
+ fringilla eu, condimentum quis, tellus. Aenean porsttitor eros vel dolor. Donec convallis pede venenatis nibh. Duis quam. Nam eget lacus. Aliquam erat volutpat.
13
+ Quisque dignissim congue leo.
14
+ </p>
15
+ <figure>
16
+ <figcaption>
17
+ <span class='caption'>Abb. 1:</span>
18
+ <span class='content'>Und hier kommt die Caption zum Bild rein.</span>
19
+ </figcaption>
20
+ </figure>
21
+ <p>
22
+ <mark data-related='hamburg' class='glossary'>Hamburg</mark>
23
+ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa.
24
+ Sed eleifend nonummy diam. Praesent mauris ante, elementum et, bibendum at, posuere sit amet, nibh. Duis tincidunt lectus quis dui viverra vestibulum.
25
+ Suspendisse vulputate aliquam dui. Nulla elementum dui ut augue. Aliquam vehicula mi at mauris. Maecenas placerat, nisl at consequat rhoncus, sem nunc
26
+ gravida justo, quis eleifend arcu velit quis lacus. Morbi magna magna, tincidunt a, mattis non, imperdiet vitae, tellus. Sed odio est, auctor ac,
27
+ sollicitudin in, consequat vitae, orci. Fusce id felis. Vivamus sollicitudin metus eget eros.Pellentesque habitant morbi tristique senectus et netus et
28
+ malesuada fames ac turpis egestas. In posuere felis nec tortor. Pellentesque faucibus. Ut accumsan ultricies elit. Maecenas at justo id velit placerat
29
+ molestie. Donec dictum lectus non odio. Cras a ante vitae enim iaculis aliquam. Mauris nunc quam, venenatis nec, euismod sit amet, egestas placerat, est.
30
+ Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Cras id elit. Integer quis urna. Ut ante enim, dapibus malesuada,
31
+ fringilla eu, condimentum quis, tellus. Aenean porsttitor eros vel dolor. Donec convallis pede venenatis nibh. Duis quam. Nam eget lacus. Aliquam erat volutpat.
32
+ Quisque dignissim congue leo.
33
+ </p>
34
+ <h2>und hier kommt dummy text</h2>
35
+ <p>
36
+ <mark data-related='hamburg' class='glossary'>Hamburg</mark>
37
+ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa.
38
+ Sed eleifend nonummy diam. Praesent mauris ante, elementum et, bibendum at, posuere sit amet, nibh. Duis tincidunt lectus quis dui viverra vestibulum.
39
+ Suspendisse vulputate aliquam dui. Nulla elementum dui ut augue. Aliquam vehicula mi at mauris. Maecenas placerat, nisl at consequat rhoncus, sem nunc
40
+ gravida justo, quis eleifend arcu velit quis lacus. Morbi magna magna, tincidunt a, mattis non, imperdiet vitae, tellus. Sed odio est, auctor ac,
41
+ sollicitudin in, consequat vitae, orci. Fusce id felis. Vivamus sollicitudin metus eget eros.
42
+ </p>
43
+ <p>
44
+ <mark data-related='hamburg' class='glossary'>Hamburg</mark>
45
+ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa.
46
+ Sed eleifend nonummy diam. Praesent mauris ante, elementum et, bibendum at, posuere sit amet, nibh. Duis tincidunt lectus quis dui viverra vestibulum.
47
+ Suspendisse vulputate aliquam dui. Nulla elementum dui ut augue. Aliquam vehicula mi at mauris. Maecenas placerat, nisl at consequat rhoncus, sem nunc
48
+ gravida justo, quis eleifend arcu velit quis lacus. Morbi magna magna, tincidunt a, mattis non, imperdiet vitae, tellus. Sed odio est, auctor ac,
49
+ sollicitudin in, consequat vitae, orci. Fusce id felis. Vivamus sollicitudin metus eget eros.Pellentesque habitant morbi tristique senectus et netus et
50
+ malesuada fames ac turpis egestas. In posuere felis nec tortor. Pellentesque faucibus. Ut accumsan ultricies elit. Maecenas at justo id velit placerat
51
+ molestie. Donec dictum lectus non odio. Cras a ante vitae enim iaculis aliquam. Mauris nunc quam, venenatis nec, euismod sit amet, egestas placerat, est.
52
+ Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Cras id elit. Integer quis urna. Ut ante enim, dapibus malesuada,
53
+ fringilla eu, condimentum quis, tellus. Aenean porsttitor eros vel dolor. Donec convallis pede venenatis nibh. Duis quam. Nam eget lacus. Aliquam erat volutpat.
54
+ Quisque dignissim congue leo.
55
+ </p>
56
+ <figure>
57
+ <figcaption>
58
+ <span class='caption'>Tab. 1:</span>
59
+ <span class='content'>Einige arithmetische Grundrechnungen (das kleine Ein-mal-Eins)</span>
60
+ </figcaption>
61
+ <table>
62
+ <thead>
63
+ <tr>
64
+ <th>#</th>
65
+ <th>#+#</th>
66
+ <th>#*#-#</th>
67
+ <th>#*#</th>
68
+ <th>#</th>
69
+ <th>#+#</th>
70
+ <th>#*#-#</th>
71
+ <th>#*#</th>
72
+ <th>#</th>
73
+ <th>#+#</th>
74
+ <th>#*#-#</th>
75
+ <th>#*#</th>
76
+ <th>#</th>
77
+ <th>#+#</th>
78
+ <th>#*#-#</th>
79
+ <th>#*#</th>
80
+ </tr>
81
+ </thead>
82
+ <tbody>
83
+ <tr>
84
+ <td>1</td>
85
+ <td>2</td>
86
+ <td>0</td>
87
+ <td>1</td>
88
+ <td>1</td>
89
+ <td>2</td>
90
+ <td>0</td>
91
+ <td>1</td>
92
+ <td>1</td>
93
+ <td>2</td>
94
+ <td>0</td>
95
+ <td>1</td>
96
+ <td>1</td>
97
+ <td>2</td>
98
+ <td>0</td>
99
+ <td>1</td>
100
+ </tr>
101
+ <tr>
102
+ <td>2</td>
103
+ <td>4</td>
104
+ <td>2</td>
105
+ <td>4</td>
106
+ <td>2</td>
107
+ <td>4</td>
108
+ <td>2</td>
109
+ <td>4</td>
110
+ <td>2</td>
111
+ <td>4</td>
112
+ <td>2</td>
113
+ <td>4</td>
114
+ <td>2</td>
115
+ <td>4</td>
116
+ <td>2</td>
117
+ <td>4</td>
118
+ </tr>
119
+ <tr>
120
+ <td>3</td>
121
+ <td>6</td>
122
+ <td>6</td>
123
+ <td>9</td>
124
+ <td>3</td>
125
+ <td>6</td>
126
+ <td>6</td>
127
+ <td>9</td>
128
+ <td>3</td>
129
+ <td>6</td>
130
+ <td>6</td>
131
+ <td>9</td>
132
+ <td>3</td>
133
+ <td>6</td>
134
+ <td>6</td>
135
+ <td>9</td>
136
+ </tr>
137
+ </tbody>
138
+ </table>
139
+ </figure>
140
+ <div class='related' id='hamburg'>
141
+ na und hier kommen dann irgendwelche Erklärungen zu Hamburg hinein
142
+ </div>
143
+ <ol>
144
+ <li>foo</li>
145
+ <li>bar</li>
146
+ </ol>
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rstt
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.9.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - LeFnord
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-06 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70269725598760 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70269725598760
25
+ - !ruby/object:Gem::Dependency
26
+ name: yard
27
+ requirement: &70269725613740 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70269725613740
36
+ - !ruby/object:Gem::Dependency
37
+ name: syntax
38
+ requirement: &70269725612840 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70269725612840
47
+ - !ruby/object:Gem::Dependency
48
+ name: thor
49
+ requirement: &70269725612400 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *70269725612400
58
+ - !ruby/object:Gem::Dependency
59
+ name: activesupport
60
+ requirement: &70269725611800 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *70269725611800
69
+ - !ruby/object:Gem::Dependency
70
+ name: celluloid
71
+ requirement: &70269725611320 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: *70269725611320
80
+ - !ruby/object:Gem::Dependency
81
+ name: slop
82
+ requirement: &70269725610860 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ type: :runtime
89
+ prerelease: false
90
+ version_requirements: *70269725610860
91
+ description: another ruby wrapper for Stuttgarter Tree Tagger
92
+ email:
93
+ - pscholz.le@gmail.com
94
+ executables:
95
+ - rstt
96
+ extensions: []
97
+ extra_rdoc_files: []
98
+ files:
99
+ - .DS_Store
100
+ - .gitignore
101
+ - .rspec
102
+ - Gemfile
103
+ - Gemfile.lock
104
+ - LICENSE.txt
105
+ - README.md
106
+ - Rakefile
107
+ - bin/rstt
108
+ - lib/rstt.rb
109
+ - lib/rstt/preprocess.rb
110
+ - lib/rstt/version.rb
111
+ - lib/tasks/config.thor
112
+ - rstt.gemspec
113
+ - spec/rstt_preprocessing_spec.rb
114
+ - spec/rstt_spec.rb
115
+ - spec/spec_helper.rb
116
+ - tmp/tmp.html
117
+ - todos.task
118
+ homepage: ''
119
+ licenses: []
120
+ post_install_message:
121
+ rdoc_options: []
122
+ require_paths:
123
+ - lib
124
+ - bin
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ none: false
127
+ requirements:
128
+ - - ! '>='
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ none: false
133
+ requirements:
134
+ - - ! '>='
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubyforge_project: rstt
139
+ rubygems_version: 1.8.15
140
+ signing_key:
141
+ specification_version: 3
142
+ summary: another ruby wrapper for Stuttgarter Tree Tagger
143
+ test_files:
144
+ - spec/rstt_preprocessing_spec.rb
145
+ - spec/rstt_spec.rb
146
+ - spec/spec_helper.rb
147
+ has_rdoc: