RubyGems - treat - Versions diffs - 1.2.0 → 2.0.0rc1 - Mend

treat 1.2.0 → 2.0.0rc1

Files changed (217) hide show

data/LICENSE +2 -2
data/README.md +12 -21
data/lib/treat/autoload.rb +44 -0
data/lib/treat/config/config.rb +38 -0
data/lib/treat/config/configurable.rb +51 -0
data/lib/treat/config/data/config.rb +50 -0
data/lib/treat/config/data/core.rb +52 -0
data/lib/treat/config/data/databases.rb +10 -0
data/lib/treat/config/data/entities.rb +15 -0
data/lib/treat/config/data/languages/agnostic.rb +31 -0
data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
data/lib/treat/config/data/languages/english.rb +95 -0
data/lib/treat/config/data/languages/french.rb +148 -0
data/lib/treat/config/data/languages/german.rb +135 -0
data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
data/lib/treat/config/data/languages/italian.rb +162 -0
data/lib/treat/config/data/languages/polish.rb +11 -0
data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
data/lib/treat/config/data/languages/spanish.rb +291 -0
data/lib/treat/config/data/languages/swedish.rb +289 -0
data/lib/treat/config/data/libraries.rb +12 -0
data/lib/treat/config/data/linguistics.rb +44 -0
data/lib/treat/config/data/tags.rb +328 -0
data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
data/lib/treat/config/importable.rb +31 -0
data/lib/treat/config/paths.rb +23 -0
data/lib/treat/config/tags.rb +37 -0
data/lib/treat/core/dsl.rb +55 -0
data/lib/treat/{installer.rb → core/installer.rb} +10 -12
data/lib/treat/core/server.rb +40 -0
data/lib/treat/entities/entities.rb +101 -0
data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
data/lib/treat/entities/entity/debuggable.rb +86 -0
data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
data/lib/treat/entities/entity/registrable.rb +36 -0
data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
data/lib/treat/entities/entity.rb +86 -77
data/lib/treat/exception.rb +3 -0
data/lib/treat/helpers/hash.rb +29 -0
data/lib/treat/helpers/help.rb +35 -0
data/lib/treat/helpers/object.rb +55 -0
data/lib/treat/helpers/string.rb +124 -0
data/lib/treat/{core → learning}/data_set.rb +11 -11
data/lib/treat/{core → learning}/export.rb +3 -3
data/lib/treat/{core → learning}/problem.rb +26 -16
data/lib/treat/{core → learning}/question.rb +5 -9
data/lib/treat/loaders/linguistics.rb +8 -9
data/lib/treat/loaders/stanford.rb +5 -11
data/lib/treat/modules.rb +33 -0
data/lib/treat/proxies/array.rb +27 -0
data/lib/treat/proxies/language.rb +47 -0
data/lib/treat/proxies/number.rb +18 -0
data/lib/treat/proxies/proxy.rb +25 -0
data/lib/treat/proxies/string.rb +18 -0
data/lib/treat/version.rb +10 -1
data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
data/lib/treat/workers/extractors/language/what_language.rb +8 -6
data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
data/lib/treat/workers/extractors/time/chronic.rb +2 -4
data/lib/treat/workers/extractors/time/nickel.rb +19 -20
data/lib/treat/workers/extractors/time/ruby.rb +2 -1
data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
data/lib/treat/workers/formatters/readers/image.rb +19 -9
data/lib/treat/workers/formatters/readers/odt.rb +2 -1
data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
data/lib/treat/workers/formatters/readers/xml.rb +0 -1
data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
data/lib/treat/workers/processors/chunkers/html.rb +1 -6
data/lib/treat/workers/processors/parsers/enju.rb +2 -4
data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
data/lib/treat/workers/workers.rb +6 -0
data/lib/treat.rb +18 -32
data/models/MANIFEST +1 -0
data/spec/core/data_set.rb +174 -0
data/spec/core/export.rb +52 -0
data/spec/core/problem.rb +144 -0
data/spec/core/question.rb +52 -0
data/spec/{collection.rb → entities/collection.rb} +20 -35
data/spec/{document.rb → entities/document.rb} +3 -54
data/spec/{entity.rb → entities/entity.rb} +10 -9
data/spec/entities/phrase.rb +33 -0
data/spec/{token.rb → entities/token.rb} +0 -57
data/spec/entities/word.rb +3 -0
data/spec/{zone.rb → entities/zone.rb} +0 -26
data/spec/helper.rb +116 -32
data/spec/sandbox.rb +258 -25
data/spec/treat.rb +26 -34
data/spec/workers/agnostic.rb +137 -0
data/spec/workers/english.rb +194 -0
data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
data/spec/workers/examples/english/phrase.xml +5 -0
data/spec/workers/examples/english/test.txt +1 -0
data/spec/workers/language.rb +280 -0
data/spec/workers.rb +28 -0
metadata +122 -105
data/lib/treat/config/core/acronyms.rb +0 -5
data/lib/treat/config/core/encodings.rb +0 -8
data/lib/treat/config/core/entities.rb +0 -2
data/lib/treat/config/core/language.rb +0 -3
data/lib/treat/config/core/paths.rb +0 -8
data/lib/treat/config/core/syntax.rb +0 -1
data/lib/treat/config/core/verbosity.rb +0 -1
data/lib/treat/config/databases/default.rb +0 -1
data/lib/treat/config/databases/mongo.rb +0 -1
data/lib/treat/config/languages/agnostic.rb +0 -34
data/lib/treat/config/languages/english.rb +0 -60
data/lib/treat/config/languages/french.rb +0 -18
data/lib/treat/config/languages/german.rb +0 -18
data/lib/treat/config/languages/italian.rb +0 -12
data/lib/treat/config/languages/polish.rb +0 -12
data/lib/treat/config/languages/spanish.rb +0 -12
data/lib/treat/config/languages/swedish.rb +0 -12
data/lib/treat/config/libraries/punkt.rb +0 -1
data/lib/treat/config/libraries/reuters.rb +0 -1
data/lib/treat/config/libraries/stanford.rb +0 -1
data/lib/treat/config/linguistics/categories.rb +0 -4
data/lib/treat/config/linguistics/punctuation.rb +0 -33
data/lib/treat/config/tags/aligned.rb +0 -221
data/lib/treat/config/tags/enju.rb +0 -71
data/lib/treat/config/tags/paris7.rb +0 -17
data/lib/treat/config/tags/ptb.rb +0 -15
data/lib/treat/config/workers/list.rb +0 -1
data/lib/treat/config.rb +0 -135
data/lib/treat/core.rb +0 -5
data/lib/treat/entities/abilities/copyable.rb +0 -47
data/lib/treat/entities/abilities/debuggable.rb +0 -83
data/lib/treat/entities/abilities/registrable.rb +0 -46
data/lib/treat/entities/collection.rb +0 -40
data/lib/treat/entities/document.rb +0 -10
data/lib/treat/entities/group.rb +0 -18
data/lib/treat/entities/section.rb +0 -13
data/lib/treat/entities/token.rb +0 -47
data/lib/treat/entities/zone.rb +0 -12
data/lib/treat/entities.rb +0 -6
data/lib/treat/helpers/didyoumean.rb +0 -57
data/lib/treat/helpers/escaping.rb +0 -15
data/lib/treat/helpers/formatting.rb +0 -41
data/lib/treat/helpers/objtohash.rb +0 -8
data/lib/treat/helpers/platform.rb +0 -15
data/lib/treat/helpers/reflection.rb +0 -17
data/lib/treat/helpers/temporary.rb +0 -27
data/lib/treat/helpers/verbosity.rb +0 -19
data/lib/treat/helpers.rb +0 -5
data/lib/treat/loaders.rb +0 -10
data/lib/treat/proxies.rb +0 -106
data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
data/spec/core.rb +0 -441
data/spec/phrase.rb +0 -112
data/spec/word.rb +0 -111

data/spec/workers/english.rb ADDED Viewed

@@ -0,0 +1,194 @@
+class Treat::Specs::Workers::English < Treat::Specs::Workers::Language
+  # TODO: parse
+  Scenarios = {
+    tokenize: {
+      group: {
+        examples: [
+          ["Julius Obsequens was a Roman writer who is believed to have lived in the middle of the fourth century AD.",      ["Julius", "Obsequens", "was", "a", "Roman", "writer", "who", "is", "believed", "to", "have", "lived", "in", "the", "middle", "of", "the", "fourth", "century", "AD", "."]],
+          ["The only work associated with his name is the Liber de prodigiis (Book of Prodigies), completely extracted from an epitome, or abridgment, written by Livy; De prodigiis was constructed as an account of the wonders and portents that occurred in Rome between 249 BC-12 BC.", ["The", "only", "work", "associated", "with", "his", "name", "is", "the", "Liber", "de", "prodigiis", "(", "Book", "of", "Prodigies", ")", ",", "completely", "extracted", "from", "an", "epitome", ",", "or", "abridgment", ",", "written", "by", "Livy", ";", "De", "prodigiis", "was", "constructed", "as", "an", "account", "of", "the", "wonders", "and", "portents", "that", "occurred", "in", "Rome", "between", "249", "BC-12", "BC", "."]],
+          ["Of great importance was the edition by the Basle Humanist Conrad Lycosthenes (1552), trying to reconstruct lost parts and illustrating the text with wood-cuts.", ["Of", "great", "importance", "was", "the", "edition", "by", "the", "Basle", "Humanist", "Conrad", "Lycosthenes", "(", "1552", ")", ",", "trying", "to", "reconstruct", "lost", "parts", "and", "illustrating", "the", "text", "with", "wood-cuts", "."]],
+          ["These have been interpreted as reports of unidentified flying objects (UFOs), but may just as well describe meteors, and, since Obsequens, probably, writes in the 4th century, that is, some 400 years after the events he describes, they hardly qualify as eye-witness accounts.", ["These", "have", "been", "interpreted", "as", "reports", "of", "unidentified", "flying", "objects", "(", "UFOs", ")", ",", "but", "may", "just", "as", "well", "describe", "meteors", ",", "and", ",", "since", "Obsequens", ",", "probably", ",", "writes", "in", "the", "4th", "century", ",", "that", "is", ",", "some", "400", "years", "after", "the", "events", "he", "describes", ",", "they", "hardly", "qualify", "as", "eye-witness", "accounts", "."]],
+          ['"At Aenariae, while Livius Troso was promulgating the laws at the beginning of the Italian war, at sunrise, there came a terrific noise in the sky, and a globe of fire appeared burning in the north.', ["\"", "At", "Aenariae", ",", "while", "Livius", "Troso", "was", "promulgating", "the", "laws", "at", "the", "beginning", "of", "the", "Italian", "war", ",", "at", "sunrise", ",", "there", "came", "a", "terrific", "noise", "in", "the", "sky", ",", "and", "a", "globe", "of", "fire", "appeared", "burning", "in", "the", "north", "."]]
+        ],
+        generator: lambda { |entity| entity.tokens.map { |tok| tok.to_s } }
+      }
+    },
+    parse: {
+      group: {
+        examples: [
+          ["A sentence to tokenize.", ["A sentence to tokenize.", "A sentence", "to tokenize",
+          "tokenize"]]
+        ],
+        generator: lambda { |group| group.phrases.map { |phrase| phrase.to_s } }
+      }
+    },
+    segment: {
+      zone: {
+        examples: [
+          ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien. It is the easternmost village of Gozo and has been inhabited since early times. The development of the present settlement began in the second half of the seventeenth century. It is a pleasant and rural place with many natural and historic attractions.", ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien.", "It is the easternmost village of Gozo and has been inhabited since early times.", "The development of the present settlement began in the second half of the seventeenth century.", "It is a pleasant and rural place with many natural and historic attractions."]],
+          ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM. But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio." "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes. This was a further proof of the value of the radio. It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community. An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM.", ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM.", "But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio.", "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes.", "This was a further proof of the value of the radio.", "It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community.", "An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]]
+        ],
+        generator: lambda { |entity| entity.sentences.map { |sent| sent.to_s } }
+      }
+    },
+    tag: {
+      phrase: {
+        examples: [
+          ["I was running", "P"]
+        ]
+      },
+      token: {
+        examples: [
+          ["running", "VBG"],
+          ["man", "NN"],
+          ["2", "CD"],
+          [".", "."],
+          ["$", "$"]
+        ]
+      }
+    },
+    category: {
+      phrase: {
+        examples: [
+          ["I was running", "phrase"]
+        ]
+      },
+      token: {
+        examples: [
+          ["running", "verb"]
+        ]
+      }
+    },
+    ordinal: {
+      word: {
+        examples: [
+          ["20", "twentieth"]
+        ]
+      },
+      number: {
+        examples: [
+          [20, "twentieth"]
+        ]
+      }
+    },
+    cardinal: {
+      word: {
+        examples: [
+          ['20', "twenty"]
+        ]
+      },
+      number: {
+        examples: [
+          [20, "twenty"]
+        ]
+      }
+    },
+    name_tag: {
+      group: {
+        examples: [
+          ["Obama and Sarkozy will meet in Berlin.", ["person", nil, "person", nil, nil, nil, "location"]]
+        ],
+        preprocessor: lambda { |group| group.tokenize },
+        generator: lambda { |group| group.words.map { |word| word.get(:name_tag) } }
+      }
+    },
+    language: { ######
+      entity: {
+        examples: [
+          ["Obama and Sarkozy will meet in Berlin.", "english"]
+        ],
+        preprocessor: lambda { |entity| Treat.core.language.detect = true; entity.do(:tokenize); entity },
+        postprocessor: lambda { |entity| Treat.core.language.detect = false; entity; },
+        generator: lambda { |group| group.words.map { |word| word.get(:name_tag) } }
+      }
+    },
+    stem: {
+      word: {
+        examples: [
+          ["running", "run"]
+        ]
+      }
+    },
+    time: {
+      group: {
+        examples: [
+          ['october 2006', 10]
+        ],
+        generator: lambda { |entity| entity.time.month }
+      }
+    },
+    topics: {
+      document: {
+        examples: [
+          ["./spec/workers/examples/english/test.txt",
+            ['household goods and hardware',
+              'united states of america',
+            'corporate/industrial']]
+          ],
+          preprocessor: lambda { |doc| doc.do :chunk, :segment, :tokenize }
+        },
+        section: {
+          # Must implement
+        },
+        zone: {
+          examples: [
+            ["Michigan, Ohio, Texas - Unfortunately, the RadioShack is closing. This is horrible news for U.S. politics.", ['household goods and hardware', 'united states of america', 'corporate/industrial']]
+          ],
+          preprocessor: lambda { |zone| zone.do :segment, :tokenize }
+        }
+      },
+      topic_words: {
+        collection: {
+          examples: [
+            ["./perf/examples/economist", [""]]
+          ],
+          preprocessor: lambda { |coll| coll.do :chunk, :segment, :tokenize }
+        }
+      },
+      conjugate: {
+        word: {
+          examples: {
+            present_participle: [
+              ["run", "running"]
+            ],
+            infinitive: [
+              ["running", "run"]
+            ]
+          }
+        }
+      },
+      declense: {
+        word: {
+          examples: {
+            singular: [
+              ["men", "man"]
+            ],
+            plural: [
+              ["man", "men"]
+            ]
+          }
+        }
+      },
+      sense: {
+        word: {
+          examples: {
+            synonyms: [
+              ["throw", ["throw", "shed", "cast", "cast off", "shake off", "throw off", "throw away", "drop", "thrust", "give", "flip", "switch", "project", "contrive", "bewilder", "bemuse", "discombobulate", "hurl", "hold", "have", "make", "confuse", "fox", "befuddle", "fuddle", "bedevil", "confound"]]
+            ],
+            antonyms: [
+              ["weak", ["strong"]]
+            ],
+            hypernyms: [
+              ["table", ["array", "furniture", "piece of furniture", "article of furniture", "tableland", "plateau", "gathering", "assemblage", "fare"]]
+            ],
+            hyponyms: [
+              ["furniture", ["baby bed", "baby's bed", "bedroom furniture", "bedstead", "bedframe", "bookcase", "buffet", "counter", "sideboard", "cabinet", "chest of drawers", "chest", "bureau", "dresser", "dining-room furniture", "etagere", "fitment", "hallstand", "lamp", "lawn furniture", "nest", "office furniture", "seat", "sectional", "Sheraton", "sleeper", "table", "wall unit", "wardrobe", "closet", "press", "washstand", "wash-hand stand"]]
+            ]
+          }
+        }
+      },
+    }
+  end

data/spec/workers/examples/english/economist/hungarys_troubles.txt ADDED Viewed

@@ -0,0 +1,46 @@
+Hungary's troubles
+Not just a rap on the knuckles
+THE pressure is piling up on the beleaguered Hungarian government. Today the European Commission threatened it with legal action over several new "cardinal" laws that would require a two-thirds majority in parliament to overturn.
+The commission is still considering the laws, but today it highlighted concerns over three issues:
+ - The independence of the central bank. Late last year the Hungarian parliament passed a law which expands the monetary council and takes the power to nominate deputies away from the governor and hands it to the prime minister. A separate law opens the door to a merger between the bank and the financial regulator.
+ - The judiciary. More than 200 judges over the age of 62 have been forced into retirement and hundreds more face the sack. The new National Judicial Authority is headed by Tünde Handó, a friend of the family of Viktor Orban, the prime minister.
+ - The independence of the national data authority.
+That wasn't all the commission had to say today. Hungary also received a ticking-off from Olli Rehn (pictured), the economic-affairs commissioner, for not doing enough to tackle its budget deficit. It may now lose access to EU funds.
+Slammed in Brussels, the Hungarian government is also under pressure at home. Earlier this week Gordon Bajnai, who served as Socialist prime minister from 2009-10, fired off a broadside that sent shockwaves through the political and media establishments.
+After a year and a half of government by the right-wing Fidesz party, wrote Mr Bajnai in a lengthy article on the website of the Patriotism and Progress Public Policy Foundation, democracy has been destroyed in Hungary. The country, he warned, is scarred by division and is drifting towards bankruptcy and away from Europe.
+Mr Bajnai called for a radical change of government and a complete political re-orientation. “A new government must have a programme readily at hand that can be applied without delay: a programme that promotes the republic, reconciliation, and recovery.”
+Fidesz is rattled by Mr Bajnai, who since leaving office has been teaching at Columbia University in New York. Understandably so. He headed a technocratic administration which stabilised the economy. Unlike his Socialist predecessor, Ferenc Gyurcsany, he was neither part of the old Communist elite nor connected to it by marriage, and so cannot be smeared as a "Komcsi". He is modern in outlook and well regarded internationally.
+Moreover, say those who know him, Mr Bajnai has little patience for the narcissistic exceptionalism that shapes Fidesz’s worldview. Exhibit A: the plaintive cry of Janos Martonyi, the foreign minister, who lamented recently: “The world will never understand our pains and spiritual wounds.” Such self-pity is unlikely to endear the Hungarian government to Brussels or Washington DC (to where it has sent an envoy this week to negotiate with the IMF).
+Fidesz won a two-thirds majority in 2010. But its support is evaporating, and analysts say there is a gap in the political market for a centrist pro-business party committed to democratic norms. Mr Bajnai, who has not ruled out a return to politics, would be an obvious candidate to lead it.
+Meanwhile, as Hungarians watch the value of their assets vaporise, in large part thanks to the government’s increasingly erratic policies, Mr Orban smirks his way through press conferences. Here he is dodging questions from a reporter from HVG, an economics weekly, about his responsibility for the crisis and trying to shift the blame to his old enemy Andras Simor, president of the central bank. The interview ran as follows:
+hvg.hu: Do you feel responsible for the falling/weakening forint?
+Mr Orban: You mean the president of the central bank? He did not comment on it.
+hvg.hu: No, you, Mr prime minister!
+Mr Orban: The personal responsibility of the president of the central bank was not discussed over the meeting.
+hvg.hu: You, your personal…!
+Mr Orban: That neither.
+Surrounded by yes-men and grinning flunkies, Mr Orban seems increasingly out of touch. His future will likely be decided not in the gilded corridors of the Hungarian parliament, but in Brussels and Washington DC.
+What happens next? If his hand is forced Mr Orban can probably endure policy reversals on the independence of the central bank and the data ombudsman. Sorry, he would say to his loyal followers: national crisis, what can you do.
+The dismantling of the judiciary would be another matter. If outsiders keep up the pressure and the judicial changes are judged to be in breach of the EU treaty, Mr Orban would be in a tricky spot. It’s hard to see how he could declare the 200-plus judges his government has forced into retirement ready for office after all, and still sit in his own.

data/spec/workers/examples/english/economist/saving_the_euro.odt ADDED Viewed

Binary file

data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw RENAMED Viewed

File without changes

data/spec/{samples → workers/examples/english}/mathematicians/euler.html RENAMED Viewed

File without changes

data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf RENAMED Viewed

File without changes

data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt RENAMED Viewed

File without changes

data/spec/{samples → workers/examples/english}/mathematicians/newton.doc RENAMED Viewed

File without changes

data/spec/workers/examples/english/phrase.xml ADDED Viewed

@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="us-ascii" ?>
+<treat>
+<sentence id='70233694858140'>
+A test entity.</sentence>
+</treat>

data/spec/workers/examples/english/test.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ A Rough Day for Republicans\n Michigan, Ohio, Texas - Unfortunately, the RadioShack is closing. This is horrible news for U.S. politics.

data/spec/workers/language.rb ADDED Viewed

@@ -0,0 +1,280 @@
+module Treat::Specs::Workers
+  class Language
+    include Treat::Core::DSL
+    @@list = []
+    # Headings for the list of workers table.
+    BenchmarkHeadings =
+    ['Method', 'Worker', 'Description',
+    'Reference', 'User time', 'System time',
+    'Real time', 'Accuracy']
+    # Add the language to the list,
+    # and define an initialize method.
+    def self.inherited(base)
+      @@list << base
+      base.class_eval do
+        def initialize(mode)
+          klass = self.class.const_get(:Scenarios)
+          @scenarios, @mode = klass, mode
+          @language = self.class.mn.downcase
+        end
+      end
+    end
+    # Return the list of registered languages.
+    def self.list; @@list; end
+    # Default options for #run.
+    DefaultOptions = { save_html: true }
+    # Runs the benchmarks or spec tasks.
+    def run(options = {})
+      options = DefaultOptions.merge(options)
+      results = run_scenarios
+      if @mode == 'benchmark'
+        l = @language.capitalize
+        print "\n\nBenchmark for #{l}\n"
+        Treat::Specs::Helper.text_table(
+        BenchmarkHeadings, results)
+        if options[:save_html]
+          Treat::Specs::Helper.html_table(
+          BenchmarkHeadings, results)
+        end
+      end
+    end
+    # Run all scenarios for a language, for all of the
+    # algorithm categories (e.g. Processors, Extractors).
+    def run_scenarios
+      categories = Treat.languages[
+      @language].workers
+      results = []
+      method = "run_scenarios_as_#{@mode}s"
+      categories.members.each do |cat|
+        category = categories[cat]
+        category.members.each do |grp|
+          group = category[grp]
+          group_class = Treat::Workers.
+          const_get(cat.cc).
+          const_get(grp.cc)
+          #next unless group_class ==
+          #Treat::Workers::Learners::Classifiers
+          group.each do |worker|
+            next if worker == :mongo  # FIXME
+            next if worker == :html   # FIXME
+            next if worker == :lda    # FIXME
+            results << send(method,
+            worker, group_class)
+          end
+        end
+      end
+      results
+    end
+    # Run all benchmarks.
+    def run_scenarios_as_benchmarks(worker, group)
+      info = get_worker_info(worker, group)
+      description, reference =
+      info[:description], info[:reference]
+      accuracy = 0
+      time = ::Benchmark.measure do |x|
+        accuracy = run_scenarios_for_all_workers(
+        worker, group, 'benchmark')
+      end
+      # Return a row for the table.
+      [ group.method.to_s, worker.to_s,
+        description.strip,
+        reference ? reference : '-',
+        time.utime.round(4).to_s,
+        time.stime.round(4).to_s,
+        time.real.round(4).to_s,
+        accuracy ]
+    end
+    # Run examples as specs on each
+    # of the worker's target entities.
+    # Delegates to #run_scenarios_for_all_workers with
+    # the mode fixed to 'spec' and returns its result.
+    def run_scenarios_as_specs(worker, group)
+      run_scenarios_for_all_workers(worker, group, 'spec')
+    end
+    # Run a scenario (i.e. spec or benchmark
+    # all workers available to perform a given
+    # method call in a certain language).
+    def run_scenarios_for_all_workers(worker, group, mode)
+      accuracy = 0; i = 0; n = 0
+      method = "run_worker_#{mode}s"
+      group.targets.each do |target|
+        next if target == :section ### FIXME
+        i2, n2 = send(method, worker, group, target)
+        i += i2; n += n2
+      end
+      # Return the accuracy of the worker.
+      accuracy = (i.to_f/n.to_f*100).round(2)
+      accuracy
+    end
+    # Run all examples available to test the worker
+    # on a given target entity type as benchmarks.
+    # Outputs [# successes, # tries].
+    def run_worker_benchmarks(worker, group, target)
+      scenario = find_scenario(group.method, target)
+      return [0, 1] unless scenario
+      scenario = @scenarios[group.method][target]
+      if scenario[:examples].is_a?(Hash)
+        i, n = run_scenario_presets(
+        worker, group, target, scenario)
+      else
+        i, n = Treat::Specs::Workers::Language.
+        run_examples(worker, group, target, scenario)
+      end
+      [i, n]
+    end
+    # Run all examples available to test the worker
+    # on a given target entity type as RSpec tests.
+    # Builds and registers an RSpec example group for the
+    # worker group. Returns [0, 1] when no scenario exists,
+    # otherwise the [i, n] counters as they stand right
+    # after registration.
+    # NOTE(review): the counters are only updated inside the
+    # `it` blocks; if RSpec runs those after this method has
+    # returned, the value returned here is [0, 0] - confirm.
+    def run_worker_specs(worker, group, target)
+      scenario = find_scenario(group.method, target)
+      return [0, 1] unless scenario
+      # Spec description(s) for this method: used below either
+      # directly or indexed by preset.
+      does = Treat::Specs::Workers::
+      Descriptions[group.method]
+      i = 0; n = 0;
+      rspec_task = RSpec::Core::ExampleGroup.describe(group) do
+        context "when it is called on a #{target}" do
+          # Examples keyed by preset: one spec per preset value,
+          # passed to the worker through the group's preset option.
+          if scenario[:examples].is_a?(Hash) && group.preset_option
+            preset_examples = scenario[:examples]
+            preset_examples.each do |preset, examples|
+              context "and #{group.preset_option} is set to #{preset}" do
+                it does[preset] do
+                  options = {group.preset_option => preset}
+                  # Copy of the scenario narrowed to this preset's examples.
+                  bm = scenario.dup; bm[:examples] = examples
+                  i2, n2 = *Treat::Specs::Workers::Language.
+                  run_examples(worker, group, target, bm, options)
+                  # Every example must succeed for the spec to pass.
+                  (i2.to_f/n2.to_f*100).round(2).should eql 100.0
+                  i += i2; n += n2
+                end
+              end
+            end
+          else
+            # No presets: a single spec covering all examples.
+            it does do
+              i, n = Treat::Specs::Workers::Language.
+              run_examples(worker, group, target, scenario)
+              (i.to_f/n.to_f*100).round(2).should eql 100.0
+            end
+          end
+          # Check for accuracy.
+        end
+      end
+      rspec_task.register
+      [i, n]
+    end
+    def self.run_examples(worker, group, target, scenario, options = {})
+      i = 0; n = 0
+      examples, generator, preprocessor =
+      scenario[:examples], scenario[:generator],
+      scenario[:preprocessor]
+      target_class = Treat::Entities.
+      const_get(target.cc)
+      if examples.is_a?(Hash)
+        unless examples[worker]
+          raise Treat::Exception,
+          "No example defined for worker #{worker}."
+        end
+        examples = examples[worker]
+      end
+      examples.each do |example|
+        value, expectation, options2 = *example
+        entity = target_class.build(value)
+        begin
+          if preprocessor
+            preprocessor.call(entity)
+          end
+          if options2.is_a?(::Proc)
+            options2 = options2.call
+          end
+          options = options.merge(options2 || {})
+          if generator
+            result = entity.send(group.
+            method, worker, options)
+            operand = (group.type ==
+            :computer ? result : entity)
+            result = generator.call(operand)
+          else
+            result = entity.send(group.
+            method, worker, options)
+          end
+        rescue Treat::Exception => e
+          puts e.message
+          next
+        end
+        puts result.inspect
+        i += 1 if result == expectation
+        n += 1
+      end
+      (i == 0 && n == 0) ? [1, 1] : [i, n]
+    end
+    # * Helpers * #
+    # Given a method and a target,
+    # find a scenario for the current
+    # language class instance.
+    def find_scenario(method, target)
+      unless @scenarios[method]
+        puts "Warning: there is no scenario for " +
+        "method ##{method} called on " +
+        "#{target.to_s.plural} in the " +
+        "#{@language.capitalize} language."
+        return nil
+      end
+      unless @scenarios[method]
+        puts "Warning: there is a scenario for " +
+        "method ##{method} in the " +
+        "#{@language.capitalize} language, " +
+        "but there are no examples for target " +
+        "entity type '#{target.to_s.plural}'."
+        return nil
+      end
+      @scenarios[method][target]
+    end
+    # Parse out the description and reference from
+    # the Ruby file defining the worker/adapter.
+    def get_worker_info(worker, group)
+      bits = group.to_s.split('::')
+      bits.collect! { |bit| bit.ucc }
+      file = bits.join('/') + "/#{worker}.rb"
+      contents = File.read(Treat.paths.lib + file)
+      head = contents[0...contents.index('class')]
+      parts = head.gsub("\n# ", "\n").gsub('#', '').
+      gsub('encoding: utf-8', '').
+      gsub(/Authors: (.*)/m, ''). # ouch
+      gsub(/License: (.*)/m, '').
+      gsub(/Website: (.*)/m, '').
+      split('Original paper: ')
+      {description: parts[0] || '',
+      reference: parts[1] || '-'}
+    end
+    # Runs a benchmark for each preset.
+    def run_scenario_presets(worker, group, target, scenario)
+      i, n = 0, 0
+      examples = scenario[:examples]
+      examples.each do |preset, examples|
+        options = {group.preset_option => preset}
+        sc = scenario.dup; sc[:examples] = examples
+        i2, n2 = Treat::Specs::Workers::Language.
+        run_examples(worker, group, target, sc, options)
+        i += i2; n += n2
+      end
+      [i, n]
+    end
+  end
+end

data/spec/workers.rb ADDED Viewed

@@ -0,0 +1,28 @@
+module Treat::Specs::Workers
+    Descriptions = {
+      stem: "returns the stem of the word",
+      conjugate: {
+        infinitive: "returns the infinitive form of a verb",
+        present_participle: "returns the present participle form of a verb"
+      },
+      declense: {
+        plural: "returns the plural form of the word",
+        singular: "returns the singular form of the word"
+      },
+      ordinal: "returns the ordinal form of a number",
+      sense: {
+        synonyms: "returns the synonyms of the word",
+        antonyms: "returns the antonyms of the word",
+        hypernyms: "returns the hypernyms of the word",
+        hyponyms:"returns the hyponyms of the word"
+      },
+      tag: "returns the tag of the token",
+      category: "returns the category of the number, punctuation or symbol",
+      name_tag: "tags the named entity words in the group of words",
+      time: "annotates all entities within the group with time information",
+      tokenize: "splits the group of words into tokens and adds them as children of the group",
+      parse: "parses a group of words into its syntax tree, adding nested phrases and tokens as children of the group",
+      topics: "returns a list of general topics the document belongs to",
+      segment: "splits a zone into phrases/sentences and adds them as children of the zone"
+    }
+end