words 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,18 +1,55 @@
1
+ # coding: utf-8
2
+
1
3
  module Words
2
4
 
5
+ # Provides a pure ruby connector to the Wordnet dataset.
3
6
  class PureWordnetConnection
4
7
 
8
+ # Convert single letter POS to its multi-letter equivalent
5
9
  SHORT_TO_POS_FILE_TYPE = { 'a' => 'adj', 'r' => 'adv', 'n' => 'noun', 'v' => 'verb' }
10
+
11
+ # Set of indexes for seeking directly into wordnet files to identify terms with significantly improved performance
6
12
  INDEXES = {
7
13
  :noun => {"mv"=>2908615, "fa"=>1455677, "g-"=>1695451, "hy"=>2196287, "ac"=>21116, "wr"=>4743086, "rt"=>3724403, "k_"=>2405676, "mw"=>2908680, "fb"=>1539515, "g."=>1695573, "hz"=>2219696, "ad"=>48269, "ws"=>4747643, "ru"=>3724431, "mx"=>2908742, "fc"=>1539583, "80"=>6057, "ae"=>63445, "wt"=>4747670, "rv"=>3740230, "ka"=>2405742, "l-"=>2459655, "my"=>2908771, "fd"=>1539637, "af"=>68288, "wu"=>4747756, "rw"=>3740258, "kb"=>2417524, "l."=>2459745, "fe"=>1539722, "ag"=>74279, "wv"=>4748078, "kc"=>2417632, "ah"=>83260, "ry"=>3740424, "pa"=>3143343, "36"=>5141, "ai"=>83677, "ww"=>4748110, "pb"=>3211047, "ke"=>2417664, "aj"=>91267, "v-"=>4545234, "pc"=>3211172, "fh"=>1559167, "ak"=>91562, "wy"=>4748137, "v."=>4545387, "ua"=>4496561, "pd"=>3211308, "kg"=>2427122, "fi"=>1559226, "al"=>92464, "ub"=>4496594, "pe"=>3211419, "2n"=>4947, "fj"=>1596225, "am"=>130827, "kh"=>2427183, "uc"=>4496797, "pf"=>3263095, "88"=>6083, "an"=>154839, "ki"=>2428739, "za"=>4773142, "ud"=>4496830, "fl"=>1596256, "ao"=>203539, "zb"=>4775763, "ph"=>3263286, "fm"=>1622351, "ap"=>204006, "uf"=>4496962, "pi"=>3293279, "fn"=>1622416, "aq"=>218174, "kk"=>2442519, "zd"=>4775847, "ug"=>4497019, "pj"=>3328895, "fo"=>1622444, "ar"=>219963, "kl"=>2442551, "ze"=>4775874, "uh"=>4497483, "pk"=>3328925, "fp"=>1650875, "as"=>262743, "km"=>2443913, "ui"=>4497543, "pl"=>3329011, "at"=>282628, "kn"=>2443973, "pm"=>3357376, "fr"=>1650935, "au"=>299805, "ko"=>2448754, "zh"=>4778739, "uk"=>4497767, "pn"=>3357459, "fs"=>1681993, "av"=>316371, "kp"=>2453337, "zi"=>4778934, "ul"=>4498102, "po"=>3358476, "ft"=>1682056, "aw"=>319552, "um"=>4501463, "fu"=>1682252, "ax"=>320182, "1-"=>1892, "kr"=>2453390, "un"=>4503199, "pp"=>3416671, "ay"=>321448, "ks"=>2455025, "zl"=>4782157, "fw"=>1695021, "az"=>322115, "kt"=>2455090, "d_"=>1083112, "up"=>4528358, "pr"=>3416755, "o'"=>3029255, "10"=>1959, "ku"=>2455116, "zn"=>4782189, "ps"=>3483993, "e-"=>1289529, "fy"=>1695051, "kv"=>2458073, "da"=>1083181, "zo"=>4782217, "ur"=>4532258, 
"pt"=>3492829, "i_"=>2220034, "11"=>2218, "kw"=>2458174, "db"=>1108193, "e."=>1289664, "us"=>4538820, "t'"=>4259996, "pu"=>3496345, "60"=>5843, "12"=>2315, "dc"=>1108287, "ut"=>4542211, "pv"=>3517927, "ia"=>2220399, "13"=>2445, "ky"=>2458844, "zr"=>4784927, "pw"=>3517990, "ib"=>2220863, "j."=>2341407, "14"=>2471, "dd"=>1108386, "zs"=>4784956, "uu"=>4544207, "px"=>3518017, "ic"=>2221692, "15"=>2558, "de"=>1108520, "uv"=>4544342, "py"=>3518043, "o."=>3029509, "na"=>2919040, "id"=>2226538, "16"=>2733, "df"=>1168182, "zu"=>4784989, "s_"=>3741387, "nb"=>2942448, "ie"=>2230327, "17"=>2788, "dg"=>1168212, "ux"=>4544722, "t-"=>4260104, "nc"=>2942542, "if"=>2230421, "18"=>3024, "dh"=>1168244, "zw"=>4785347, "uy"=>4544863, "t."=>4260425, "sa"=>3741419, "nd"=>2942608, "ig"=>2230448, "19"=>3319, "di"=>1168953, "x_"=>4749915, "uz"=>4544913, "sb"=>3800209, "ne"=>2942718, "dj"=>1223633, "zy"=>4785471, "y-"=>4755272, "sc"=>3800328, "dk"=>1223962, "xa"=>4749991, "sd"=>3836240, "ng"=>2975369, "ii"=>2232411, "dl"=>1224061, "se"=>3836272, "nh"=>2975761, "ij"=>2232906, "dm"=>1224120, "xc"=>4750937, "sf"=>3898201, "ni"=>2975793, "ik"=>2233046, "dn"=>1224525, "sg"=>3898276, "nj"=>2989622, "il"=>2233145, "do"=>1224823, "y2"=>4755339, "xe"=>4750963, "sh"=>3898399, "im"=>2236982, "dp"=>1251815, "si"=>3934020, "nl"=>2989719, "in"=>2250132, "sj"=>3974373, "nm"=>2989774, "io"=>2317192, "dr"=>1251968, "xh"=>4752829, "sk"=>3974412, "ip"=>2319242, "ds"=>1271920, "xi"=>4752879, "sl"=>3982232, "nn"=>2989842, "iq"=>2320204, "dt"=>1272024, "sm"=>3995291, "no"=>2989930, "ir"=>2320265, "du"=>1272082, "sn"=>4003308, "np"=>3016438, "is"=>2328830, "dv"=>1284206, "xl"=>4753577, "so"=>4011968, "it"=>2336645, "b_"=>324352, "dw"=>1284263, "xm"=>4753603, "sp"=>4051506, "nr"=>3016535, "iu"=>2338757, "0"=>1840, "4-"=>5374, "sq"=>4097051, "ns"=>3016775, "iv"=>2338786, "1"=>1865, "ba"=>324554, "c-"=>600455, "dy"=>1286409, "xo"=>4753634, "sr"=>4102220, "nt"=>3016984, "8_"=>6119, "iw"=>2340321, "2"=>4177, 
"c."=>600659, "dz"=>1289430, "g_"=>1695801, "nu"=>3017043, "ix"=>2340452, "9-"=>6205, "3"=>4985, "bb"=>390069, "40"=>5406, "ss"=>4102507, "nv"=>3026658, "iy"=>2341048, "4"=>5349, "3d"=>5205, "ga"=>1695861, "h-"=>2030546, "st"=>4102714, "nw"=>3026690, "iz"=>2341117, "9/"=>6238, "5"=>5594, "bd"=>390218, "gb"=>1726120, "h."=>2030576, "su"=>4180331, "90"=>6271, "6"=>5818, "be"=>390276, "c2"=>601143, "gc"=>1726268, "xt"=>4753701, "sv"=>4232564, "ny"=>3026772, "7"=>5946, "la"=>2459898, "m-"=>2643999, "gd"=>1726351, "sw"=>4232896, "q_"=>3524972, "8"=>6032, "lb"=>2507825, "m."=>2644096, "44"=>5495, "ge"=>1726452, "xv"=>4753754, "r-"=>3544131, "9"=>6180, "bh"=>428251, "lc"=>2507915, "h2"=>2030821, "sy"=>4247569, "r."=>3544158, "qa"=>3525003, "bi"=>428758, "ld"=>2507971, "v_"=>4545417, "sz"=>4259706, "bj"=>454188, "le"=>2508074, "m1"=>2644168, "xx"=>4753864, "qc"=>3525650, "bk"=>454250, "lf"=>2545647, "m2"=>2644194, "gh"=>1914825, "xy"=>4754258, "w."=>4622501, "va"=>4545477, "bl"=>454276, "lg"=>2545676, "m3"=>2644220, "gi"=>1915953, "qe"=>3525677, "bm"=>487643, "lh"=>2545732, "gj"=>1928001, "vc"=>4562340, "bn"=>487795, "li"=>2545866, "vd"=>4562367, "bo"=>487822, "lj"=>2588790, "gl"=>1928034, "ve"=>4562424, "bp"=>527090, "gm"=>1941192, "vf"=>4587559, "qi"=>3525733, "3r"=>5272, "gn"=>1941253, "br"=>527207, "ll"=>2588826, "go"=>1942339, "vh"=>4587589, "lm"=>2589254, "gp"=>1965489, "a'"=>6392, "bs"=>567010, "vi"=>4587630, "3t"=>5322, "bt"=>567093, "lo"=>2589280, "gr"=>1965634, "bu"=>567123, "lp"=>2623355, "gs"=>2010072, "bv"=>598604, "vl"=>4612572, "qo"=>3526131, "1_"=>3606, "bw"=>598664, "2-"=>4204, "lr"=>2623408, "gu"=>2010162, "8v"=>6153, "k'"=>2405456, "ls"=>2623434, "a-"=>6423, "by"=>598787, "vo"=>4613249, "lt"=>2623463, "e_"=>1290183, "6_"=>5911, "gw"=>2026208, "a."=>6630, "lu"=>2623552, "'h"=>1740, "20"=>4330, "ea"=>1290252, "gy"=>2026300, "21"=>4385, "vr"=>4621015, "lw"=>2635752, "eb"=>1300178, "f."=>1455392, "j_"=>2341888, "qu"=>3526162, "lx"=>2635783, "ec"=>1301281, 
"70"=>5971, "22"=>4411, "vt"=>4621044, "ly"=>2635907, "ed"=>1308417, "ja"=>2341922, "k-"=>2405491, "23"=>4474, "vu"=>4621076, "qw"=>3544030, "o_"=>3029601, "k."=>2405619, "24"=>4500, "p-"=>3142944, "ee"=>1316159, "25"=>4636, "p."=>3143064, "oa"=>3029664, "ef"=>1316593, "jd"=>2362188, "26"=>4662, "t_"=>4260563, "p/"=>3143308, "ob"=>3030924, "eg"=>1318289, "je"=>2362216, "27"=>4688, "vx"=>4622352, "u-"=>4495612, "oc"=>3037012, "eh"=>1321628, "jf"=>2371138, "k2"=>2405647, "28"=>4714, "vy"=>4622382, "u."=>4495708, "ta"=>4260664, "od"=>3042646, "ei"=>1321758, "29"=>4740, "y_"=>4755366, "tb"=>4295216, "oe"=>3044953, "ej"=>1323919, "'s"=>1771, "jh"=>2371165, "z-"=>4773112, "tc"=>4295357, "of"=>3046532, "ek"=>1324264, "78"=>5997, "ji"=>2371193, "ya"=>4755402, "td"=>4295640, "og"=>3049310, "el"=>1324361, "yb"=>4759174, "te"=>4295669, "oh"=>3049737, "em"=>1348056, "u3"=>4496533, "oi"=>3050182, "en"=>1357595, "oj"=>3052575, "eo"=>1377701, "ye"=>4759264, "th"=>4330947, "ok"=>3052696, "1s"=>3787, "ep"=>1378260, "ti"=>4366648, "ol"=>3053511, "jn"=>2373545, "eq"=>1387580, "yg"=>4767903, "tj"=>4385574, "om"=>3062383, "jo"=>2373601, "er"=>1391721, "yh"=>4767972, "tk"=>4385664, "on"=>3064512, "d'"=>1082835, "es"=>1401937, "yi"=>4768028, "tl"=>4385691, "et"=>1408856, "tm"=>4385787, "oo"=>3070387, "jr"=>2392018, "eu"=>1413487, "tn"=>4385843, "op"=>3071039, "a"=>6297, "ev"=>1427580, "yl"=>4768444, "to"=>4385934, "b"=>323845, "c_"=>601171, "ew"=>1432034, "ym"=>4768512, "tp"=>4413193, "or"=>3081061, "c"=>600316, "ju"=>2392073, "ex"=>1432298, "n'"=>2918885, "5-"=>5619, "os"=>3115959, "d"=>1082786, "jv"=>2405234, "ca"=>601439, "d-"=>1082871, "ey"=>1452457, "yo"=>4768542, "tr"=>4413220, "ot"=>3122137, "e"=>1289463, "cb"=>712079, "d."=>1082934, "ez"=>1454953, "yp"=>4771198, "ts"=>4466928, "ou"=>3124879, "f"=>1455328, "50"=>5765, "yq"=>4771250, "ov"=>3129739, "ha"=>2030856, "i-"=>2219776, "g"=>1695338, "jy"=>2405260, "cc"=>712135, "yr"=>4771279, "tt"=>4467892, "ow"=>3136728, "hb"=>2076148, 
"i."=>2219806, "h"=>2030472, "cd"=>712198, "tu"=>4467944, "ox"=>3137307, "hc"=>2076182, "i"=>2219725, "ce"=>712729, "n-"=>2918921, "yt"=>4771310, "tv"=>4484640, "oy"=>3141259, "hd"=>2076237, "j"=>2341367, "cf"=>737620, "ma"=>2644246, "yu"=>4771416, "tw"=>4485217, "r_"=>3544308, "oz"=>3142126, "n."=>2918965, "he"=>2076337, "k"=>2405363, "cg"=>737739, "mb"=>2737124, "yv"=>4773040, "tx"=>4490575, "s-"=>3741191, "hf"=>2121232, "l"=>2459527, "ch"=>737800, "mc"=>2737372, "ty"=>4490610, "s."=>3741222, "ra"=>3544339, "hg"=>2121297, "m"=>2643918, "ci"=>811192, "md"=>2738186, "tz"=>4495399, "s/"=>3741360, "rb"=>3580128, "n"=>2918808, "cj"=>827445, "me"=>2738337, "x-"=>4749199, "rc"=>3580216, "o"=>3029204, "hh"=>2121341, "mf"=>2788090, "wa"=>4622931, "p"=>3142904, "hi"=>2121371, "cl"=>827472, "mg"=>2788180, "wb"=>4654707, "re"=>3580247, "q"=>3524944, "cm"=>860967, "mh"=>2788224, "rf"=>3658425, "r"=>3544069, "cn"=>861094, "mi"=>2788281, "s"=>3741105, "rg"=>3658504, "co"=>861878, "hl"=>2139669, "we"=>4654819, "t"=>4259917, "rh"=>3658530, "cp"=>1012981, "mk"=>2830687, "hm"=>2139701, "u"=>4495561, "ri"=>3667785, "ml"=>2830716, "hn"=>2139877, "v"=>4545170, "cr"=>1013175, "ho"=>2139935, "wh"=>4672549, "w"=>4622437, "cs"=>1048516, "mm"=>2830804, "hp"=>2182075, "x"=>4749153, "wi"=>4692782, "ct"=>1048663, "mn"=>2830893, "4t"=>5531, "hq"=>2182104, "y"=>4755232, "cu"=>1049194, "mo"=>2831144, "hr"=>2182134, "z"=>4773075, "rn"=>3687863, "cv"=>1068811, "mp"=>2881103, "hs"=>2182299, "wl"=>4724359, "ro"=>3688004, "2_"=>4766, "cw"=>1068869, "4w"=>5558, "ht"=>2182563, "a_"=>7001, "wm"=>4724387, "rp"=>3724343, "l'"=>2459588, "3-"=>5010, "mr"=>2881244, "hu"=>2182649, "wn"=>4724445, "b-"=>323934, "cy"=>1068938, "ms"=>2881650, "wo"=>4724472, "b."=>324186, "cz"=>1082090, "mt"=>2881981, "f_"=>1455546, "hw"=>2196252, "aa"=>7256, "wp"=>4743059, "30"=>5078, "mu"=>2882421, ".2"=>1811, "ab"=>8002},
8
14
  :adj => {"2d"=>4592, "31"=>4851, "fa"=>261714, ".3"=>1880, "hy"=>340391, "ac"=>12021, "32"=>4905, "wr"=>818088, ".4"=>2020, "ad"=>18614, "80"=>7432, "ae"=>23100, "ru"=>595594, "ka"=>388840, "l-"=>392296, "33"=>4959, "my"=>455103, "81"=>7486, "af"=>24303, "34"=>5013, "fe"=>267964, "82"=>7512, "ag"=>26180, "rw"=>598392, "35"=>5067, "83"=>7538, "ah"=>28531, "36"=>5121, "pa"=>505816, "84"=>7564, "ai"=>28659, "ke"=>389430, "37"=>5175, "85"=>7590, "aj"=>29502, "38"=>5229, "v-"=>789182, "86"=>7644, "ak"=>29530, "39"=>5283, "fi"=>270598, "wy"=>819172, "87"=>7670, "al"=>29597, "2n"=>4618, "ub"=>722231, "pe"=>515525, "am"=>36386, "kh"=>389925, "88"=>7696, "an"=>40420, "ki"=>389956, "za"=>822848, "fl"=>275697, "89"=>7722, "ao"=>51761, "ph"=>523560, "ap"=>51861, "pi"=>526560, "aq"=>56753, "ug"=>722267, "fo"=>280302, "ar"=>57023, "ze"=>823049, "as"=>62836, "pl"=>530025, "at"=>67162, "kn"=>390913, "fr"=>286656, "au"=>69848, "ko"=>391903, "uk"=>722361, "pn"=>534250, "7t"=>7346, "av"=>73727, "zi"=>823276, "ul"=>722396, "po"=>534415, "aw"=>74649, "um"=>723080, "fu"=>290894, "ax"=>75702, "un"=>723684, "az"=>76033, "up"=>785441, "6-"=>6566, "pr"=>541620, "10"=>2210, "ku"=>392129, "ps"=>556687, "da"=>189020, "zo"=>823490, "pt"=>558010, "11"=>2525, "ur"=>787459, "60"=>6600, "pu"=>558096, "12"=>2691, "us"=>788002, "61"=>6654, "ia"=>343317, "13"=>2857, "ky"=>392202, "ut"=>788423, "62"=>6680, "ib"=>343385, "14"=>3023, "63"=>6706, "ic"=>343452, "15"=>3189, "de"=>191816, "na"=>456102, "o."=>482752, "64"=>6732, "py"=>562611, "id"=>344057, "16"=>3355, "uv"=>788962, "65"=>6786, "17"=>3521, "if"=>345034, "18"=>3687, "t-"=>685912, "ux"=>789057, "66"=>6840, "ig"=>345064, "19"=>3797, "sa"=>598455, "di"=>206077, "67"=>6866, "dj"=>220263, "uz"=>789122, "ne"=>459116, "zy"=>823837, "y-"=>820785, "68"=>6892, "sc"=>604552, "69"=>6918, "xa"=>819260, "ii"=>345421, "se"=>608988, "xc"=>819292, "ni"=>464278, "il"=>345474, "do"=>220299, "y2"=>820817, "xe"=>819571, "im"=>347578, "sh"=>623055, "in"=>353335, 
"si"=>629245, "io"=>380365, "dr"=>225168, "ip"=>380637, "sk"=>635321, "5t"=>6514, "xi"=>819747, "sl"=>636220, "ir"=>380672, "sm"=>639638, "du"=>227825, "no"=>466328, "is"=>382721, "sn"=>641766, "xl"=>819882, "it"=>383968, "so"=>643211, "dw"=>229873, "0"=>2160, "sp"=>649763, "4-"=>5389, "iv"=>384300, "1"=>2185, "ba"=>76418, "sq"=>656684, "dy"=>229940, "2"=>3934, "sr"=>658125, "nt"=>481308, "ix"=>384390, "9-"=>7800, "3"=>4645, "40"=>5460, "nu"=>481335, "4"=>5364, "41"=>5570, "ga"=>294211, "h-"=>313795, "5"=>6083, "st"=>658188, "42"=>5624, "90"=>7834, "6"=>6541, "be"=>83696, "su"=>671222, "43"=>5678, "91"=>7888, "7"=>6971, "sv"=>681720, "la"=>392328, "ny"=>482674, "92"=>7914, "8"=>7373, "sw"=>681768, "44"=>5732, "ge"=>296692, "xv"=>820134, "93"=>7940, "9"=>7775, "bh"=>89324, "45"=>5786, "r."=>566629, "94"=>7966, "bi"=>89359, "sy"=>683436, "46"=>5840, "qa"=>563600, "95"=>7992, "le"=>398566, "47"=>5894, "xx"=>820244, "96"=>8046, "48"=>5948, "w-"=>800937, "gh"=>299675, "97"=>8072, "bl"=>96334, "49"=>6002, "va"=>789214, "gi"=>299913, "98"=>8098, "li"=>402864, "99"=>8124, "bo"=>102434, "gl"=>300821, "ve"=>791850, "3r"=>5337, "gn"=>303067, "br"=>107474, "go"=>303237, "vi"=>795294, "lo"=>409658, "gr"=>305879, "bu"=>114508, "8t"=>7748, "2-"=>3959, "gu"=>312309, "a-"=>8177, "by"=>118813, "vo"=>799000, "a."=>8266, "lu"=>416153, "7-"=>6996, "20"=>3996, "lv"=>417909, "ea"=>230712, "gy"=>313409, "21"=>4106, "eb"=>232434, "lx"=>418019, "ec"=>232594, "70"=>7030, "qu"=>563668, "22"=>4160, "ly"=>418802, "ed"=>233599, "71"=>7084, "ja"=>384470, "23"=>4214, "vu"=>800557, "72"=>7110, "24"=>4268, "ee"=>234244, "73"=>7136, "25"=>4322, "ef"=>234377, "oa"=>482782, "p."=>505788, "74"=>7162, "26"=>4376, "eg"=>235115, "ob"=>482870, "75"=>7188, "je"=>385496, "27"=>4430, "u-"=>722199, "oc"=>484785, "76"=>7242, "28"=>4484, "ei"=>235526, "od"=>485595, "29"=>4538, "ta"=>685944, "oe"=>486011, "77"=>7268, "of"=>486090, "78"=>7294, "ji"=>386292, "ya"=>820854, "el"=>236340, "79"=>7320, "em"=>238831, 
"oh"=>487955, "te"=>690141, "en"=>240855, "oi"=>487986, "eo"=>245834, "ye"=>820972, "ok"=>488175, "1s"=>3907, "th"=>695473, "ep"=>245983, "ol"=>488233, "ti"=>702295, "eq"=>247606, "om"=>489492, "jo"=>386595, "er"=>248382, "on"=>489888, "6t"=>6944, "es"=>249587, "yi"=>822204, "et"=>250446, "oo"=>493273, "jr"=>387299, "eu"=>251131, "op"=>493333, "ev"=>252105, "to"=>705018, "or"=>495731, "c"=>118928, "ju"=>387326, "ex"=>253536, "5-"=>6108, "os"=>498199, "d"=>188965, "ca"=>118953, "ey"=>261468, "yo"=>822254, "ot"=>498747, "tr"=>709204, "d."=>188990, "ou"=>499048, "ts"=>717450, "50"=>6142, "ov"=>501567, "ha"=>313827, "cc"=>131547, "51"=>6252, "ow"=>505326, "cd"=>131600, "52"=>6278, "i"=>343292, "ce"=>131626, "tu"=>717516, "n-"=>456074, "53"=>6304, "ox"=>505512, "ma"=>419357, "54"=>6330, "yu"=>822690, "he"=>320515, "k"=>388815, "tw"=>719084, "l"=>392271, "s-"=>598423, "ch"=>134647, "55"=>6356, "ra"=>566657, "m"=>419332, "ci"=>142370, "ty"=>721720, "56"=>6410, "tz"=>722141, "me"=>429139, "57"=>6436, "58"=>6462, "x-"=>819228, "hi"=>327145, "cl"=>144311, "59"=>6488, "wa"=>800969, "re"=>571244, "mi"=>436715, "co"=>150272, "rh"=>587466, "we"=>804320, "hm"=>331476, "ri"=>588287, "u"=>722172, "v"=>789157, "cr"=>176976, "ho"=>331505, "wh"=>809316, "x"=>819203, "ct"=>183185, "mn"=>442650, "4t"=>6056, "wi"=>811263, "cu"=>183216, "mo"=>442756, "9t"=>8150, "cv"=>186983, "ro"=>591099, "a_"=>8294, "cx"=>187009, "3-"=>4670, "hu"=>337473, "cy"=>187229, "cz"=>188827, "wo"=>814766, "30"=>4741, "mu"=>450942, ".2"=>1740, "8-"=>7398, "ab"=>8553},
9
15
  :verb => {"ox"=>317944, "ep"=>169705, "ki"=>261865, "ne"=>302629, "x-"=>522318, "oy"=>318231, "ru"=>392475, "ur"=>502622, "bu"=>56124, "eq"=>169837, "oz"=>318263, "us"=>502895, "aa"=>1740, "er"=>170250, "ut"=>503162, "ab"=>1767, "es"=>170652, "ho"=>232836, "ac"=>3529, "et"=>171392, "ni"=>303882, "ta"=>465047, "ad"=>6490, "da"=>118966, "by"=>61875, "eu"=>171927, "ae"=>8718, "ev"=>172155, "kn"=>263317, "af"=>8929, "ko"=>264585, "ag"=>9437, "ex"=>173105, "te"=>472676, "wa"=>508666, "ga"=>204003, "ey"=>178534, "hu"=>237591, "ai"=>10286, "de"=>120649, "no"=>304585, "th"=>475571, "ti"=>478689, "we"=>512699, "za"=>523414, "ge"=>206371, "ja"=>255802, "hy"=>239280, "al"=>10951, "di"=>136972, "kv"=>264661, "am"=>12800, "wh"=>514913, "an"=>13706, "dj"=>148986, "ze"=>523472, "wi"=>517343, "je"=>256873, "ma"=>280541, "nu"=>305772, "ap"=>16409, "gh"=>210633, "to"=>481065, "aq"=>18267, "gi"=>210790, "ar"=>18347, "zi"=>523585, "as"=>19764, "do"=>149014, "pa"=>318325, "qu"=>356569, "tr"=>483983, "at"=>21915, "gl"=>213421, "ji"=>257383, "me"=>287710, "ts"=>490986, "wo"=>519567, "au"=>23027, "av"=>23935, "dr"=>152421, "gn"=>215270, "g."=>203975, "aw"=>24513, "go"=>215409, "tu"=>491013, "wr"=>520951, "ax"=>24704, "mi"=>290558, "pe"=>324550, "sa"=>395891, "zo"=>523781, "ca"=>61939, "du"=>158291, "tw"=>493797, "az"=>24779, "gr"=>218976, "sc"=>398961, "dw"=>159135, "jo"=>257765, "ph"=>328230, "pi"=>328891, "se"=>403889, "va"=>503309, "ty"=>494840, "fa"=>178632, "dy"=>159383, "gu"=>222269, "ce"=>72643, "mo"=>295025, "pl"=>332016, "sh"=>410924, "si"=>417650, "ve"=>504664, "ya"=>522393, "ch"=>73529, "fe"=>183011, "gy"=>223294, "ju"=>258735, "ci"=>81247, "po"=>336199, "sk"=>421199, "ic"=>240465, "sl"=>422915, "id"=>240617, "sm"=>427144, "vi"=>506088, "ye"=>522895, "cl"=>82562, "la"=>264691, "mu"=>299509, "sn"=>428534, "fi"=>185556, "pr"=>341164, "so"=>430811, "ig"=>240935, "ps"=>350290, "sp"=>433886, "co"=>87668, "pt"=>350435, "sq"=>440723, "yi"=>523082, "fl"=>189489, "le"=>268258, 
"my"=>301203, "pu"=>350501, "ob"=>306433, "vo"=>507446, "cr"=>110051, "oc"=>307497, "fo"=>194383, "od"=>307842, "st"=>442042, "il"=>241065, "li"=>271717, "ra"=>358635, "py"=>356507, "su"=>454237, "vr"=>508439, "cu"=>115910, "im"=>241391, "of"=>307921, "yo"=>523290, "ba"=>24810, "fr"=>199363, "in"=>244059, "og"=>308488, "sw"=>460901, "io"=>254520, "vu"=>508468, "ft"=>202432, "oi"=>308518, "re"=>363091, "sy"=>463729, "ea"=>159598, "cy"=>118743, "fu"=>202461, "ir"=>254766, "ok"=>308584, "be"=>30231, "eb"=>160312, "lo"=>275662, "rh"=>386786, "ec"=>160555, "is"=>255120, "om"=>308616, "ri"=>386952, "ed"=>160739, "it"=>255504, "on"=>308689, "ha"=>223396, "ug"=>495123, "bi"=>37946, "o."=>306373, "ef"=>161223, "oo"=>308723, "xe"=>522360, "eg"=>161607, "op"=>308863, "bl"=>39622, "he"=>228710, "ka"=>259787, "lu"=>279432, "or"=>309814, "ro"=>389309, "ej"=>161771, "os"=>310691, "ul"=>495155, "bo"=>44472, "ek"=>161875, "um"=>495274, "el"=>161933, "ke"=>259848, "na"=>301427, "ly"=>280367, "ou"=>310976, "un"=>495308, "em"=>162861, "hi"=>231023, "ov"=>313178, "e-"=>159560, "br"=>48932, "en"=>164868, "ow"=>317836, "up"=>501838},
10
16
  :adv => {"ul"=>146918, "sa"=>121999, "me"=>87949, "is"=>79726, "al"=>6456, "fu"=>54854, "ty"=>146844, "ro"=>121173, "op"=>100918, "it"=>79763, "am"=>8151, "ba"=>15505, "gi"=>56230, "va"=>156823, "un"=>147057, "sc"=>122613, "pe"=>105172, "an"=>8719, "cy"=>31883, "or"=>101364, "ea"=>40812, "i."=>64191, "up"=>155754, "se"=>123441, "os"=>101738, "mi"=>89536, "lu"=>85885, "eb"=>41346, "ap"=>10042, "bc"=>16615, "gl"=>56390, "ph"=>107333, "ot"=>101845, "ha"=>58556, "do"=>38706, "ec"=>41381, "ve"=>157238, "ur"=>156560, "pi"=>107876, "ou"=>101917, "o."=>96668, "ar"=>10688, "be"=>16664, "ed"=>41593, "us"=>156624, "sh"=>125358, "ov"=>102923, "go"=>56887, "as"=>11428, "ye"=>162287, "ut"=>156758, "si"=>126337, "ru"=>121807, "ow"=>103605, "ly"=>86271, "dr"=>39948, "ee"=>41743, "at"=>12932, "pl"=>108832, "na"=>92504, "he"=>60006, "ef"=>41771, "au"=>14641, "jo"=>80066, "vi"=>157846, "sk"=>127582, "bi"=>18176, "gr"=>57195, "eg"=>41998, "a."=>1802, "av"=>14980, "sl"=>127918, "pn"=>109745, "mo"=>90407, "ke"=>80998, "du"=>40494, "aw"=>15087, "yi"=>162508, "sm"=>128914, "po"=>109782, "ei"=>42035, "ax"=>15383, "ux"=>156789, "ta"=>138198, "sn"=>129359, "ne"=>92966, "gu"=>58424, "bl"=>18795, "so"=>129638, "ca"=>22029, "hi"=>61819, "wa"=>159164, "sp"=>132158, "dy"=>40699, "el"=>42063, "ju"=>80410, "ki"=>81028, "vo"=>158698, "sq"=>133295, "pr"=>111025, "em"=>42369, "fa"=>47339, "bo"=>19295, "te"=>138977, "ps"=>114778, "ni"=>94052, "mu"=>91756, "en"=>42630, "yo"=>162542, "e'"=>40734, "we"=>159590, "pu"=>114861, "p."=>103637, "ib"=>64217, "ep"=>43786, "br"=>19838, "ce"=>23083, "th"=>140092, "st"=>133477, "ic"=>64297, "eq"=>43822, "fe"=>48826, "cf"=>23412, "la"=>81356, "ho"=>62489, "ze"=>162691, "ti"=>142646, "su"=>135818, "my"=>92434, "id"=>64326, "er"=>43965, "kn"=>81226, "wh"=>160191, "vu"=>159098, "es"=>44216, "bu"=>20620, "ch"=>23461, "ie"=>64701, "wi"=>160770, "sw"=>137556, "py"=>115529, "ob"=>96696, "et"=>44370, "a_"=>1884, "b."=>15451, "ci"=>24468, "if"=>64725, "ra"=>116360, 
"oc"=>97499, "no"=>94335, "le"=>82562, "ig"=>64753, "eu"=>44719, "zi"=>162788, "sy"=>137715, "od"=>97535, "fi"=>49357, "e."=>40786, "ev"=>44758, "by"=>20853, "cl"=>24639, "hu"=>63384, "to"=>143107, "of"=>97605, "da"=>31998, "ex"=>45689, "ab"=>2343, "re"=>117290, "li"=>83328, "ac"=>3468, "fl"=>50352, "wo"=>161523, "ga"=>55429, "co"=>25439, "ad"=>4138, "tr"=>145364, "nu"=>96480, "hy"=>63857, "il"=>64855, "ae"=>5138, "rh"=>120391, "im"=>65172, "af"=>5207, "ja"=>79789, "fo"=>50969, "wr"=>162050, "ri"=>120500, "qu"=>115566, "ok"=>98535, "cr"=>30383, "de"=>32710, "in"=>67015, "ag"=>5747, "ma"=>86304, "'t"=>1740, "ah"=>6150, "ge"=>55713, "tu"=>146503, "om"=>98587, "ip"=>78991, "ai"=>6367, "fr"=>53467, "pa"=>103691, "on"=>98620, "lo"=>84939, "je"=>79886, "cu"=>31304, "tw"=>146716, "o'"=>96613, "di"=>35621, "ir"=>79023, "ak"=>6428, "c."=>21975}
11
17
  }
18
+
19
+ # Hash object used for caching retrieved terms to further improve retrieval performance
12
20
  WORDS_CACHE = Hash.new
13
21
 
14
- attr_reader :connected, :connection_type, :data_path, :wordnet_path
22
+ ## Returns the current connection status of the wordnet object.
23
+ #
24
+ # @return [true, false] The current connection status of the wordnet object.
25
+ attr_reader :connected
15
26
 
27
+ ## Returns the current connection status of the wordnet object.
28
+ #
29
+ # @return [true, false] The current connection status of the wordnet object.
30
+ alias :connected? connected
31
+
32
+ # Returns the type of the current wordnet connection.
33
+ #
34
+ # @return [Symbol] The current wordnet connection type. Currently supported :pure & :tokyo.
35
+ attr_reader :connection_type
36
+
37
+ # Returns the data path currently in use (this may be irrelevant when using the pure connector and thus could be nil.)
38
+ #
39
+ # @return [Pathname, nil] The path to the data directory currently in use. Returns nil if unknown.
40
+ attr_reader :data_path
41
+
42
+ # Returns the path to the wordnet collection currently in use (this may be irrelevant when using the tokyo connector and thus could be nil.)
43
+ #
44
+ # @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
45
+ attr_reader :wordnet_path
46
+
47
+ # Constructs a new pure ruby connector for use with the words wordnet class.
48
+ #
49
+ # @param [Pathname] data_path Specifies the directory within which constructed datasets can be found (evocations etc...)
50
+ # @param [Pathname] wordnet_path Specifies the directory within which the wordnet dictionary can be found.
51
+ # @return [PureWordnetConnection] A new wordnet connection.
52
+ # @raise [BadWordnetConnector] If an invalid connector type is provided.
16
53
  def initialize(data_path, wordnet_path)
17
54
 
18
55
  @data_path, @wordnet_path, @connection_type, @connected = data_path, wordnet_path, :pure, false
@@ -21,6 +58,9 @@ module Words
21
58
 
22
59
  end
23
60
 
61
+ # Causes the connection specified within the wordnet object to be reopened if currently closed.
62
+ #
63
+ # @raise [BadWordnetDataset] If the wordnet database cannot be located (no wordnet path is set).
24
64
  def open!
25
65
 
26
66
  raise BadWordnetDataset, "Failed to locate the wordnet database. Please ensure it is installed and that if it resides at a custom path that path is given as an argument when constructing the Words object." if @wordnet_path.nil?
@@ -36,6 +76,8 @@ module Words
36
76
 
37
77
  end
38
78
 
79
+ # Causes the current connection to wordnet to be closed.
80
+ #
39
81
  def close!
40
82
 
41
83
  @connected = false
@@ -43,56 +85,29 @@ module Words
43
85
 
44
86
  end
45
87
 
46
- # main methods for connector
47
-
88
+ # Locates from a term any relevant homographs and constructs a homographs hash.
89
+ #
90
+ # @param [String] term The specific term that is desired from within wordnet.
91
+ # @param [true, false] use_cache Specify whether to use caching when finding and retrieving terms.
92
+ # @return [Hash, nil] A hash in the format { 'lemma' => ..., 'tagsense_counts' => ..., 'synset_ids' => ... }, or nil if no homographs are available.
93
+ # @raise [NoWordnetConnection] If there is currently no wordnet connection.
48
94
  def homographs(term, use_cache = true)
49
95
 
50
96
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
51
97
 
52
- # clean up the term
53
- term = term.gsub(" ", "_").downcase
54
-
55
- # identify the term initials
56
- term_initials = term[0,2]
57
-
58
- # for each index we have
59
- INDEXES.keys.each do |index_pos|
60
- next unless INDEXES[index_pos].include? term_initials # if the index does not contain the desired word skip the index
61
- file = File.new(@wordnet_path + "index.#{index_pos}") # open wordnet index file
62
- file.seek INDEXES[index_pos][term_initials] # seek to the index starting point
63
-
64
- while (line = file.gets) && (term_initials == line[0,2]) # break if line if EOF or we are past the term and thus the line doesnt start with the term initials
65
- lemma, pos, *index_parts = line.split(' ') # split the line and split off the lemma
66
- if (lemma == term || use_cache) # if it's the term we are after or we are using cache then we save the word
67
- WORDS_CACHE[lemma] ||= [ lemma ] # ensure that there is datastructure to hold our word information
68
- if !WORDS_CACHE[lemma].include?(index_pos) # unless there already exists an entry for said word associated with the current index
69
- tagsense_count, *synset_offsets = index_parts.slice(index_parts[1].to_i+3..-1) # seperate out what is useful from the index as a whole
70
- WORDS_CACHE[lemma] += [ pos, tagsense_count.to_i, synset_offsets ] # add the tagsense_count and the synsets for the pos
71
- break if lemma == term # if we have the word in this index then we can jump out and check the next
72
- end
73
- end
74
- end
75
-
76
- file.close # close wordnet index file
77
- end unless WORDS_CACHE.include?(term) && use_cache # if we have the term already and are ok with using cache then simply use that!
98
+ # Ensure that the term is in the cache. If it is not, locate and add it if possible.
99
+ cache_ensure_from_wordnet(term, use_cache)
78
100
 
79
- # we should either have the word in cache now or nowt... we should now change that into homograph input format (we do this here to improve performance during the cacheing performed above)
80
- lemma, *raw_homographs = WORDS_CACHE[term] # split the homograph
81
- unless raw_homographs.empty? # if we have something... format it
82
- tagsense_counts = Array.new
83
- synset_ids = Array.new
84
- while !raw_homographs.empty?
85
- pos = raw_homographs.shift
86
- tagsense_counts << "#{pos}#{raw_homographs.shift}"
87
- synset_ids += raw_homographs.shift.map { |sense_offset| "#{pos}#{sense_offset}" }
88
- end
89
- return { 'lemma' => lemma, 'tagsense_counts' => tagsense_counts.join('|'), 'synset_ids' => synset_ids.join('|') }
90
- else
91
- return nil # we return nil if we haven't found the term
92
- end
101
+ # We should now either have the word in the cache or nothing at all... we should now change that into homograph input format (we do this here to improve performance during the caching performed above)
102
+ cached_entry_to_homograph_hash(term)
93
103
 
94
104
  end
95
105
 
106
+ # Locates from a synset_id a specific synset and constructs a synset hash.
107
+ #
108
+ # @param [String] synset_id The synset id to locate.
109
+ # @return [Hash, nil] A hash in the format { "synset_id" => ..., "lexical_filenum" => ..., "synset_type" => ..., "words" => ..., "relations" => ..., "gloss" => ... }, or nil if no synset is available.
110
+ # @raise [NoWordnetConnection] If there is currently no wordnet connection.
96
111
  def synset(synset_id)
97
112
 
98
113
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
@@ -109,18 +124,27 @@ module Words
109
124
 
110
125
  end
111
126
 
127
+ # Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README)
128
+ #
129
+ # @return [true, false] Whether evocations are currently available or not.
112
130
  def evocations?
113
131
 
114
132
  !evocations('n08112402').nil?
115
133
 
116
134
  end
117
135
 
118
- def evocations(senset_id)
136
+ # Locates from a synset id any relevant evocations and constructs an evocations hash.
137
+ #
138
+ # @see Synset
139
+ # @param [String] synset_id The id number of a specific synset.
140
+ # @return [Hash, nil] A hash in the format { 'relations' => ..., 'means' => ..., 'medians' => ... }, or nil if no evocations are available.
141
+ # @raise [NoWordnetConnection] If there is currently no wordnet connection.
142
+ def evocations(synset_id)
119
143
 
120
144
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
121
145
 
122
146
  if defined? @evocations
123
- raw_evocations = @evocations[senset_id + "s"]
147
+ raw_evocations = @evocations[synset_id + "s"]
124
148
  { 'relations' => raw_evocations[0], 'means' => raw_evocations[1], 'medians' => raw_evocations[2]} unless raw_evocations.nil?
125
149
  else
126
150
  nil
@@ -128,15 +152,73 @@ module Words
128
152
 
129
153
  end
130
154
 
131
-
155
+ # Provides a textual description of the current connection state of the Wordnet object.
156
+ #
157
+ # @return [String] A textual description of the current connection state of the Wordnet object. e.g. "Words not Connected" or "Words running in pure mode using wordnet files found at /opt/wordnet"
132
158
  def to_s
133
159
 
134
160
  "Words running in pure mode using wordnet files found at #{wordnet_path}"
135
161
 
136
162
  end
137
163
 
138
- alias connected? connected
139
-
164
+ private
165
+
166
+ def cache_ensure_from_wordnet(term, use_cache)
167
+
168
+ # clean up the term
169
+ term = term.gsub(" ", "_").downcase
170
+
171
+ # identify the term initials
172
+ term_initials = term[0,2]
173
+
174
+ # for each index we have
175
+ INDEXES.keys.each do |index_pos|
176
+ next unless INDEXES[index_pos].include? term_initials # if the index does not contain the desired word skip the index
177
+ file = File.new(@wordnet_path + "index.#{index_pos}") # open wordnet index file
178
+ file.seek INDEXES[index_pos][term_initials] # seek to the index starting point
179
+
180
+ while (line = file.gets) && (term_initials == line[0,2]) # break if line if EOF or we are past the term and thus the line doesnt start with the term initials
181
+ break if construct_cache_item(line, term, use_cache, index_pos)
182
+ end
183
+
184
+ file.close # close wordnet index file
185
+ end unless WORDS_CACHE.include?(term) && use_cache # if we have the term already and are ok with using cache then simply use that!
186
+
187
+ end
188
+
189
+ def construct_cache_item(line, term, use_cache, index_pos)
190
+
191
+ lemma, pos, *index_parts = line.split(' ') # split the line and split off the lemma
192
+ if (lemma == term || use_cache) # if it's the term we are after or we are using cache then we save the word
193
+ WORDS_CACHE[lemma] ||= [ lemma ] # ensure that there is datastructure to hold our word information
194
+ if !WORDS_CACHE[lemma].include?(index_pos) # unless there already exists an entry for said word associated with the current index
195
+ tagsense_count, *synset_offsets = index_parts.slice(index_parts[1].to_i+3..-1) # seperate out what is useful from the index as a whole
196
+ WORDS_CACHE[lemma] += [ pos, tagsense_count.to_i, synset_offsets ] # add the tagsense_count and the synsets for the pos
197
+ return true if lemma == term # if we have the word in this index then we can jump out and check the next index
198
+ end
199
+ end
200
+ return false
201
+
202
+ end
203
+
204
+ def cached_entry_to_homograph_hash(term)
205
+
206
+ lemma, *raw_homographs = WORDS_CACHE[term] # split the homograph
207
+ unless raw_homographs.empty? # if we have something... format it
208
+ tagsense_counts = Array.new
209
+ synset_ids = Array.new
210
+ while !raw_homographs.empty?
211
+ pos = raw_homographs.shift
212
+ tagsense_counts << "#{pos}#{raw_homographs.shift}"
213
+ synset_ids += raw_homographs.shift.map { |sense_offset| "#{pos}#{sense_offset}" }
214
+ end
215
+ return { 'lemma' => lemma, 'tagsense_counts' => tagsense_counts.join('|'), 'synset_ids' => synset_ids.join('|') }
216
+ else
217
+ return nil # we return nil if we haven't found the term
218
+ end
219
+
220
+ end
221
+
140
222
  end
141
223
 
142
224
  end
@@ -1,16 +1,48 @@
1
+ # coding: utf-8
2
+
1
3
  # gem includes
2
4
  require 'rubygems'
3
5
  require 'rufus-tokyo' if Gem.available?('rufus-tokyo')
4
6
 
5
7
  module Words
6
8
 
9
+ # Provides a Tokyo Cabinet connector to the Wordnet dataset.
7
10
  class TokyoWordnetConnection
8
11
 
9
- attr_reader :connected, :connection_type, :data_path, :wordnet_path
10
-
12
+ ## Returns the current connection status of the wordnet object.
13
+ #
14
+ # @return [true, false] The current connection status of the wordnet object.
15
+ attr_reader :connected
16
+
17
+ ## Returns the current connection status of the wordnet object.
18
+ #
19
+ # @return [true, false] The current connection status of the wordnet object.
20
+ alias :connected? connected
21
+
22
+ # Returns the type of the current wordnet connection.
23
+ #
24
+ # @return [Symbol] The current wordnet connection type. Currently supported :pure & :tokyo.
25
+ attr_reader :connection_type
26
+
27
+ # Returns the datapath currently in use (this may be irrelevant when using the pure connector and thus could be nil.)
28
+ #
29
+ # @return [Pathname, nil] The path to the data directory currently in use. Returns nil if unknown.
30
+ attr_reader :data_path
31
+
32
+ # Returns the path to the wordnet collection currently in use (this may be irrelevant when using the tokyo connector and thus could be nil.)
33
+ #
34
+ # @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
35
+ attr_reader :wordnet_path
36
+
37
+ # Constructs a new Tokyo Cabinet connector for use with the words wordnet class.
38
+ #
39
+ # @param [Pathname] data_path Specifies the directory within which constructed datasets can be found (tokyo index, evocations etc...)
40
+ # @param [Pathname] wordnet_path Specifies the directory within which the wordnet dictionary can be found.
41
+ # @return [TokyoWordnetConnection] A new wordnet connection.
42
+ # @raise [BadWordnetConnector] If an invalid connector type is provided.
11
43
  def initialize(data_path, wordnet_path)
12
44
 
13
- @data_path, @wordnet_path, @connection_type, @connected = data_path, wordnet_path, :tokyo, false
45
+ @data_path, @wordnet_path, @connection_type, @connected = data_path + 'wordnet.tct', wordnet_path, :tokyo, false
14
46
 
15
47
  # ensure we have the rufus gem loaded, else there is little point in continuing...
16
48
  raise BadWordnetConnector, "Coulden't find the rufus-tokyo gem. Please ensure it's installed." unless Gem.available?('rufus-tokyo')
@@ -19,20 +51,26 @@ module Words
19
51
 
20
52
  end
21
53
 
54
+ # Causes the connection specified within the wordnet object to be reopened if currently closed.
55
+ #
56
+ # @raise [BadWordnetConnector] If an invalid connector type is provided.
22
57
  def open!
23
58
 
24
- @dataset_path = @data_path + 'wordnet.tct'
25
- if @dataset_path.exist?
26
- @connection = Rufus::Tokyo::Table.new(@dataset_path.to_s, :mode => 'r')
27
- @connected = true
28
- else
29
- @connected = false
30
- raise BadWordnetDataset, "Failed to locate the tokyo words dataset at #{@dataset_path}. Please insure you have created it using the words gems provided 'build_wordnet' command."
59
+ unless connected?
60
+ if @data_path.exist?
61
+ @connection = Rufus::Tokyo::Table.new(@data_path.to_s, :mode => 'r')
62
+ @connected = true
63
+ else
64
+ @connected = false
65
+ raise BadWordnetDataset, "Failed to locate the tokyo words dataset at #{@data_path}. Please insure you have created it using the words gems provided 'build_wordnet' command."
66
+ end
31
67
  end
32
68
  return nil
33
69
 
34
70
  end
35
71
 
72
+ # Causes the current connection to wordnet to be closed.
73
+ #
36
74
  def close!
37
75
 
38
76
  if connected?
@@ -43,15 +81,23 @@ module Words
43
81
 
44
82
  end
45
83
 
46
- # main methods for connector
47
-
84
+ # Locates from a term any relevant homographs and constructs a homographs hash.
85
+ #
86
+ # @param [String] term The specific term that is desired from within wordnet.
87
+ # @return [Hash, nil] A hash in the format { 'lemma' => ..., 'tagsense_counts' => ..., 'synset_ids' => ... }, or nil if no homographs are available.
88
+ # @raise [NoWordnetConnection] If there is currently no wordnet connection.
48
89
  def homographs(term)
49
90
 
50
91
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
51
92
  @connection[term]
52
93
 
53
94
  end
54
-
95
+
96
+ # Locates from a synset_id a specific synset and constructs a synset hash.
97
+ #
98
+ # @param [String] synset_id The synset id to locate.
99
+ # @return [Hash, nil] A hash in the format { "synset_id" => ..., "lexical_filenum" => ..., "synset_type" => ..., "words" => ..., "relations" => ..., "gloss" => ... }, or nil if no synset is available.
100
+ # @raise [NoWordnetConnection] If there is currently no wordnet connection.
55
101
  def synset(synset_id)
56
102
 
57
103
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
@@ -59,27 +105,37 @@ module Words
59
105
 
60
106
  end
61
107
 
108
+ # Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README.)
109
+ #
110
+ # @return [true, false] Whether evocations are currently available or not.
62
111
  def evocations?
63
112
 
64
113
  !evocations('n08112402').nil?
65
114
 
66
115
  end
67
116
 
68
- def evocations(senset_id)
117
+ # Locates from a synset id any relevant evocations and constructs an evocations hash.
118
+ #
119
+ # @see Synset
120
+ # @param [String] synset_id The id number of a specific synset.
121
+ # @return [Hash, nil] A hash in the format { 'relations' => ..., 'means' => ..., 'medians' => ... }, or nil if no evocations are available.
122
+ # @raise [NoWordnetConnection] If there is currently no wordnet connection.
123
+ def evocations(synset_id)
69
124
 
70
125
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
71
- @connection[senset_id + "s"]
126
+ @connection[synset_id + "s"]
72
127
 
73
128
  end
74
129
 
130
+ # Provides a textual description of the current connection state of the Wordnet object.
131
+ #
132
+ # @return [String] A textual description of the current connection state of the Wordnet object. e.g. "Words not Connected" or "Words running in tokyo mode with dataset at /opt/wordnet"
75
133
  def to_s
76
134
 
77
135
  "Words running in tokyo mode with dataset at #{@dataset_path}"
78
136
 
79
137
  end
80
138
 
81
- alias connected? connected
82
-
83
139
  end
84
140
 
85
141
  end
data/lib/words.rb CHANGED
@@ -1,49 +1,67 @@
1
- # std library includes
1
+ # coding: utf-8
2
+
3
+ # Standard library includes
2
4
  require 'pathname'
3
5
 
4
- # local includes
6
+ # Local includes
5
7
  require File.join(File.dirname(__FILE__),'homographs.rb')
6
8
 
9
+ # The Words gem namespace. Within this we offer a number of classes to facilitate useful interaction with words and language. Currently this largely consists of Words::Wordnet which offers simple wordnet access.
7
10
  module Words
8
11
 
9
- # we identify each wordnet connector installed and there paths
12
+ # We identify each wordnet connector installed and their paths
10
13
  SUPPORTED_CONNECTIORS = Dir[File.join(File.dirname(__FILE__),'wordnet_connectors','*_wordnet_connection.rb')].inject(Hash.new) { |connectors, connection_file| connectors[ File.basename(connection_file).split('_').first.to_sym ] = connection_file; connectors }
14
+ # An array of typical wordnet install locations (if you have a standard install somewhere else please open an issue on github so we can improve!)
11
15
  DEFAULT_WORDNET_LOCATIONS = ['/usr/share/wordnet', '/usr/local/share/wordnet', '/usr/local/WordNet-3.0', '/opt/WordNet-3.0', '/opt/wordnet', '/opt/local/share/WordNet-3.0/']
12
16
 
13
- # specify some useful exception types
17
+ # Exception to indicate that the wordnet connector specified is not currently available/supported.
14
18
  class BadWordnetConnector < RuntimeError; end
19
+ # Exception to indicate that there is a problem connecting to a specified wordnet dataset.
15
20
  class BadWordnetDataset < RuntimeError; end
21
+ # Exception to indicate that there is not currently a connection to wordnet and thus any request cannot be fulfilled.
16
22
  class NoWordnetConnection < RuntimeError; end
17
23
 
18
- # specify the wordnet control object
24
+ # The wordnet class provides a control and query interface for interaction with the wordnet dataset of your choice. It creates a connection, based on specified parameters, to a wordnet dataset and provides
25
+ # the means to interrogate that dataset. In addition it provides control and information about that wordnet connection.
19
26
  class Wordnet
20
27
 
28
+ ## Returns the underlying wordnet connection object.
29
+ #
30
+ # @return [PureWordnetConnection, TokyoWordnetConnection] the underlying wordnet connection object.
21
31
  attr_reader :wordnet_connection
22
-
32
+
33
+ # Constructs a new wordnet connection object.
34
+ #
35
+ # @param [Symbol] connector_type Specifies the connector type or mode desired. Current supported connectors are :pure and :tokyo.
36
+ # @param [String, Symbol] wordnet_path Specifies the directory within which the wordnet dictionary can be found. It can be set to :search to attempt to locate wordnet automatically.
37
+ # @param [String, Symbol] data_path Specifies the directory within which constructed datasets can be found (tokyo index, evocations etc...) It can be set to :default to use the standard location inside the gem directory.
38
+ # @return [Wordnet] The wordnet connection object.
39
+ # @raise [BadWordnetConnector] If an invalid connector type is provided.
23
40
  def initialize(connector_type = :pure, wordnet_path = :search, data_path = :default)
24
41
 
25
- # check and specify useful paths
42
+ # Check and specify useful paths
26
43
  wordnet_path = Wordnet::locate_wordnet(wordnet_path)
27
44
  data_path = (data_path == :default ? Pathname.new(File.join(File.dirname(__FILE__), '..', 'data')) : Pathname.new( data_path ))
28
45
 
29
- # ensure we have a valid connector type
46
+ # Ensure we have a valid connector type
30
47
  raise BadWordnetConnector, "You specified an unsupported wordnet connector type. Supported connectors are: #{SUPPORTED_CONNECTIORS}" unless SUPPORTED_CONNECTIORS.include? connector_type
31
48
 
32
- # assuming we have a valid connection type we can import the relevant code (the reason we do this dynamically is to reduce loadtime)
33
- require SUPPORTED_CONNECTIORS[connector_type]
49
+ # We can assume that the desired connector is now available
50
+ desired_connector = SUPPORTED_CONNECTIORS[connector_type]
51
+
52
+ # Assuming we have a valid connection type we can import the relevant code (the reason we do this dynamically is to reduce loadtime)
53
+ require desired_connector
34
54
 
35
- # construct the connector object
36
- @wordnet_connection = Words.const_get( File.basename(SUPPORTED_CONNECTIORS[connector_type], '.rb').gsub(/(^|_)(.)/) { $2.upcase } ).new(data_path, wordnet_path)
55
+ # Construct the connector object
56
+ @wordnet_connection = Words.const_get( File.basename(desired_connector, '.rb').gsub(/(^|_)(.)/) { $2.upcase } ).new(data_path, wordnet_path)
37
57
 
38
- # construct some conveniance menthods for relation type access
39
- [:connection_type, :wordnet_path, :data_path, :close!, :open!, :connected?, :evocations?].each do |method_name|
40
- self.class.send(:define_method, method_name) do
41
- @wordnet_connection.send method_name if defined? @wordnet_connection
42
- end
43
- end
44
-
45
58
  end
46
59
 
60
+ # Locates the set of homographs within wordnet specific to the term entered.
61
+ #
62
+ # @param [String] term The specific term that is desired from within wordnet. This is case insensitive & we do a small amount of cleanup.
63
+ # @return [Homographs] An object encapsulating the homographs of the desired term. If the term cannot be located within wordnet then nil is returned.
64
+ # @raise [NoWordnetConnection] If there is currently no wordnet connection.
47
65
  def find(term)
48
66
 
49
67
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
@@ -51,23 +69,91 @@ module Words
51
69
  Homographs.new(homographs, @wordnet_connection) unless homographs.nil?
52
70
 
53
71
  end
54
-
72
+
73
+ # Returns the type of the current wordnet connection.
74
+ #
75
+ # @return [Symbol] The current wordnet connection type. Currently supported :pure & :tokyo.
76
+ def connection_type
77
+
78
+ @wordnet_connection.connection_type
79
+
80
+ end
81
+
82
+ # Returns the path to the wordnet collection currently in use (this may be irrelevant when using the tokyo connector and thus could be nil.)
83
+ #
84
+ # @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
85
+ def wordnet_path
86
+
87
+ @wordnet_connection.wordnet_path
88
+
89
+ end
90
+
91
+ # Returns the datapath currently in use (this may be irrelevant when using the pure connector and thus could be nil.)
92
+ #
93
+ # @return [Pathname, nil] The path to the data directory currently in use. Returns nil if unknown.
94
+ def data_path
95
+
96
+ @wordnet_connection.data_path
97
+
98
+ end
99
+
100
+ # Causes the current connection to wordnet to be closed.
101
+ #
102
+ def close!
103
+
104
+ @wordnet_connection.close!
105
+
106
+ end
107
+
108
+ # Causes the connection specified within the wordnet object to be reopened if currently closed.
109
+ #
110
+ def open!
111
+
112
+ @wordnet_connection.open!
113
+
114
+ end
115
+
116
+ # Returns the current connection status of the wordnet object.
117
+ #
118
+ # @return [true, false] The current connection status of the wordnet object.
119
+ def connected?
120
+
121
+ @wordnet_connection.connected?
122
+
123
+ end
124
+
125
+ # Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README.)
126
+ #
127
+ # @return [true, false] Whether evocations are currently available or not.
128
+ def evocations?
129
+
130
+ @wordnet_connection.evocations?
131
+
132
+ end
133
+
134
+ # Provides a textual description of the current connection state of the Wordnet object.
135
+ #
136
+ # @return [String] A textual description of the current connection state of the Wordnet object. e.g. "Words not connected" or "Words running in pure mode using wordnet files found at /opt/wordnet"
55
137
  def to_s
56
138
 
57
- # return a description of the connector
139
+ # Return a description of the connector
58
140
  !connected? ? "Words not connected" : @wordnet_connection.to_s
59
141
 
60
142
  end
61
143
 
62
144
  private
63
145
 
146
+ # Attempts to locate wordnet given an array of directories to look within
147
+ #
148
+ # @param [String, Array<String>, Symbol] base_dirs Either a path, an array of paths, or the :search symbol. Will attempt to locate wordnet within these specified directories.
149
+ # @return [Pathname, nil] The pathname of the wordnet dictionary files or nil if they can't be located within the passed directory(ies)
64
150
  def self.locate_wordnet(base_dirs)
65
151
 
66
152
  base_dirs = case base_dirs
67
153
  when :search
68
154
  DEFAULT_WORDNET_LOCATIONS
69
155
  else
70
- [ base_dirs ]
156
+ [ base_dirs ].flatten
71
157
  end
72
158
 
73
159
  base_dirs.each do |dir|