biblicit 1.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (406) hide show
  1. data/.gitmodules +3 -0
  2. data/Gemfile +1 -1
  3. data/README.md +125 -30
  4. data/Rakefile +22 -0
  5. data/biblicit.gemspec +9 -7
  6. data/lib/biblicit/cb2bib.rb +10 -11
  7. data/lib/biblicit/citeseer.rb +14 -26
  8. data/lib/biblicit/extractor.rb +40 -19
  9. data/lib/biblicit/parscit.rb +38 -0
  10. data/parscit/.gitignore +8 -0
  11. data/parscit/CHANGELOG +125 -0
  12. data/parscit/COPYING +674 -0
  13. data/parscit/COPYING.LESSER +165 -0
  14. data/parscit/INSTALL +105 -0
  15. data/parscit/README +97 -0
  16. data/{perl/ParsCit/README.TXT → parscit/USAGE} +25 -15
  17. data/parscit/bin/archtest.pl +31 -0
  18. data/parscit/bin/citeExtract.pl +562 -0
  19. data/parscit/bin/conlleval.pl +315 -0
  20. data/parscit/bin/headExtract.pl +40 -0
  21. data/parscit/bin/parsHed/convert2TokenLevel.pl +138 -0
  22. data/parscit/bin/parsHed/keywordGen.pl +308 -0
  23. data/parscit/bin/parsHed/parseXmlHeader.pl +141 -0
  24. data/parscit/bin/parsHed/redo.parsHed.pl +198 -0
  25. data/parscit/bin/parsHed/tr2crfpp_parsHed.pl +521 -0
  26. data/parscit/bin/parseRefStrings.pl +102 -0
  27. data/parscit/bin/phOutput2xml.pl +223 -0
  28. data/parscit/bin/redo.parsCit.pl +105 -0
  29. data/parscit/bin/sectExtract.pl +149 -0
  30. data/parscit/bin/sectLabel/README +110 -0
  31. data/parscit/bin/sectLabel/README.txt +110 -0
  32. data/parscit/bin/sectLabel/genericSect/crossValidation.rb +98 -0
  33. data/parscit/bin/sectLabel/genericSect/extractFeature.rb +104 -0
  34. data/parscit/bin/sectLabel/genericSectExtract.rb +53 -0
  35. data/parscit/bin/sectLabel/getStructureInfo.pl +156 -0
  36. data/parscit/bin/sectLabel/processOmniXML.pl +1427 -0
  37. data/parscit/bin/sectLabel/processOmniXML_new.pl +1025 -0
  38. data/parscit/bin/sectLabel/processOmniXMLv2.pl +1529 -0
  39. data/parscit/bin/sectLabel/processOmniXMLv3.pl +964 -0
  40. data/parscit/bin/sectLabel/redo.sectLabel.pl +219 -0
  41. data/parscit/bin/sectLabel/simplifyOmniXML.pl +382 -0
  42. data/parscit/bin/sectLabel/single2multi.pl +190 -0
  43. data/parscit/bin/sectLabel/tr2crfpp.pl +158 -0
  44. data/parscit/bin/tr2crfpp.pl +260 -0
  45. data/parscit/bin/xml2train.pl +193 -0
  46. data/parscit/lib/CSXUtil/SafeText.pm +130 -0
  47. data/parscit/lib/Omni/Config.pm +93 -0
  48. data/parscit/lib/Omni/Omnicell.pm +263 -0
  49. data/parscit/lib/Omni/Omnicol.pm +292 -0
  50. data/parscit/lib/Omni/Omnidd.pm +328 -0
  51. data/parscit/lib/Omni/Omnidoc.pm +153 -0
  52. data/parscit/lib/Omni/Omniframe.pm +223 -0
  53. data/parscit/lib/Omni/Omniline.pm +423 -0
  54. data/parscit/lib/Omni/Omnipage.pm +282 -0
  55. data/parscit/lib/Omni/Omnipara.pm +232 -0
  56. data/parscit/lib/Omni/Omnirun.pm +303 -0
  57. data/parscit/lib/Omni/Omnitable.pm +336 -0
  58. data/parscit/lib/Omni/Omniword.pm +162 -0
  59. data/parscit/lib/Omni/Traversal.pm +313 -0
  60. data/parscit/lib/ParsCit/.PostProcess.pm.swp +0 -0
  61. data/parscit/lib/ParsCit/Citation.pm +737 -0
  62. data/parscit/lib/ParsCit/CitationContext.pm +220 -0
  63. data/parscit/lib/ParsCit/Config.pm +35 -0
  64. data/parscit/lib/ParsCit/Controller.pm +653 -0
  65. data/parscit/lib/ParsCit/PostProcess.pm +505 -0
  66. data/parscit/lib/ParsCit/PreProcess.pm +1041 -0
  67. data/parscit/lib/ParsCit/Tr2crfpp.pm +1195 -0
  68. data/parscit/lib/ParsHed/Config.pm +49 -0
  69. data/parscit/lib/ParsHed/Controller.pm +143 -0
  70. data/parscit/lib/ParsHed/PostProcess.pm +322 -0
  71. data/parscit/lib/ParsHed/Tr2crfpp.pm +448 -0
  72. data/{perl/ParsCit/lib/ParsCit/Tr2crfpp.pm → parscit/lib/ParsHed/Tr2crfpp_token.pm} +22 -21
  73. data/parscit/lib/SectLabel/AAMatching.pm +1949 -0
  74. data/parscit/lib/SectLabel/Config.pm +88 -0
  75. data/parscit/lib/SectLabel/Controller.pm +332 -0
  76. data/parscit/lib/SectLabel/PostProcess.pm +425 -0
  77. data/parscit/lib/SectLabel/PreProcess.pm +116 -0
  78. data/parscit/lib/SectLabel/Tr2crfpp.pm +1246 -0
  79. data/parscit/resources/parsCit.model +0 -0
  80. data/parscit/resources/parsCit.split.model +0 -0
  81. data/{perl/ParsCit → parscit}/resources/parsCitDict.txt +205 -0
  82. data/parscit/resources/parsHed/bigram +10 -0
  83. data/parscit/resources/parsHed/keywords +10 -0
  84. data/parscit/resources/parsHed/parsHed.model +0 -0
  85. data/parscit/resources/parsHed/parsHed.template +178 -0
  86. data/parscit/resources/sectLabel/affiliation.model +0 -0
  87. data/parscit/resources/sectLabel/author.model +0 -0
  88. data/parscit/resources/sectLabel/funcWord +320 -0
  89. data/parscit/resources/sectLabel/genericSect.model +0 -0
  90. data/parscit/resources/sectLabel/sectLabel.config +42 -0
  91. data/parscit/resources/sectLabel/sectLabel.configXml +42 -0
  92. data/parscit/resources/sectLabel/sectLabel.model +0 -0
  93. data/sh/convert_to_text.sh +20 -0
  94. data/spec/biblicit/extractor_spec.rb +121 -0
  95. data/spec/fixtures/Review_of_Michael_Tyes_Consciousness_Revisited.docx +0 -0
  96. data/spec/fixtures/critical-infrastructures.ps +63951 -0
  97. data/spec/fixtures/txt/E06-1050.txt +867 -0
  98. data/spec/fixtures/txt/sample1.txt +902 -0
  99. data/spec/fixtures/txt/sample2.txt +394 -0
  100. data/spec/spec_helper.rb +3 -0
  101. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Function.pm +2 -20
  102. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/MultiClassChunking.pm +0 -7
  103. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/Parser.pm +0 -2
  104. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/ParserMethods.pm +0 -7
  105. data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/Config/API_Config.pm +6 -1
  106. data/svm-header-parse/HeaderParseService/tmp/.gitignore +4 -0
  107. data/svm-header-parse/extract.pl +75 -0
  108. metadata +351 -317
  109. data/perl/DocFilter/lib/DocFilter/Config.pm +0 -35
  110. data/perl/DocFilter/lib/DocFilter/Filter.pm +0 -51
  111. data/perl/FileConversionService/README.TXT +0 -11
  112. data/perl/FileConversionService/converters/PDFBox/pdfbox-app-1.7.1.jar +0 -0
  113. data/perl/FileConversionService/lib/CSXUtil/SafeText.pm +0 -140
  114. data/perl/FileConversionService/lib/FileConverter/CheckSum.pm +0 -77
  115. data/perl/FileConversionService/lib/FileConverter/Compression.pm +0 -137
  116. data/perl/FileConversionService/lib/FileConverter/Config.pm +0 -57
  117. data/perl/FileConversionService/lib/FileConverter/Controller.pm +0 -191
  118. data/perl/FileConversionService/lib/FileConverter/JODConverter.pm +0 -61
  119. data/perl/FileConversionService/lib/FileConverter/PDFBox.pm +0 -69
  120. data/perl/FileConversionService/lib/FileConverter/PSConverter.pm +0 -69
  121. data/perl/FileConversionService/lib/FileConverter/PSToText.pm +0 -88
  122. data/perl/FileConversionService/lib/FileConverter/Prescript.pm +0 -68
  123. data/perl/FileConversionService/lib/FileConverter/TET.pm +0 -75
  124. data/perl/FileConversionService/lib/FileConverter/Utils.pm +0 -130
  125. data/perl/HeaderParseService/lib/CSXUtil/SafeText.pm +0 -140
  126. data/perl/HeaderParseService/resources/data/EbizHeaders.txt +0 -24330
  127. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed +0 -27506
  128. data/perl/HeaderParseService/resources/data/EbizHeaders.txt.parsed.old +0 -26495
  129. data/perl/HeaderParseService/resources/data/tagged_headers.txt +0 -40668
  130. data/perl/HeaderParseService/resources/data/test_header.txt +0 -31
  131. data/perl/HeaderParseService/resources/data/test_header.txt.parsed +0 -31
  132. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test1 +0 -23
  133. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test10 +0 -23
  134. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test11 +0 -23
  135. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test12 +0 -23
  136. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test13 +0 -23
  137. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test14 +0 -23
  138. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test15 +0 -23
  139. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test2 +0 -23
  140. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test3 +0 -23
  141. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test4 +0 -23
  142. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test5 +0 -23
  143. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test6 +0 -23
  144. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test7 +0 -23
  145. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test8 +0 -23
  146. data/perl/HeaderParseService/tmp/tmpVec_1156237246.08016_test9 +0 -23
  147. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test1 +0 -23
  148. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test10 +0 -23
  149. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test11 +0 -23
  150. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test12 +0 -23
  151. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test13 +0 -23
  152. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test14 +0 -23
  153. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test15 +0 -23
  154. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test2 +0 -23
  155. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test3 +0 -23
  156. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test4 +0 -23
  157. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test5 +0 -23
  158. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test6 +0 -23
  159. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test7 +0 -23
  160. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test8 +0 -23
  161. data/perl/HeaderParseService/tmp/tmpVec_914027525.276114_test9 +0 -23
  162. data/perl/ParsCit/crfpp/traindata/parsCit.template +0 -60
  163. data/perl/ParsCit/crfpp/traindata/parsCit.train.data +0 -12104
  164. data/perl/ParsCit/crfpp/traindata/tagged_references.txt +0 -500
  165. data/perl/ParsCit/lib/CSXUtil/SafeText.pm +0 -140
  166. data/perl/ParsCit/lib/ParsCit/Citation.pm +0 -462
  167. data/perl/ParsCit/lib/ParsCit/CitationContext.pm +0 -132
  168. data/perl/ParsCit/lib/ParsCit/Config.pm +0 -46
  169. data/perl/ParsCit/lib/ParsCit/Controller.pm +0 -306
  170. data/perl/ParsCit/lib/ParsCit/PostProcess.pm +0 -367
  171. data/perl/ParsCit/lib/ParsCit/PreProcess.pm +0 -333
  172. data/perl/ParsCit/resources/parsCit.model +0 -0
  173. data/perl/extract.pl +0 -199
  174. data/spec/biblicit/cb2bib_spec.rb +0 -48
  175. data/spec/biblicit/citeseer_spec.rb +0 -40
  176. /data/{perl → svm-header-parse}/HeaderParseService/README.TXT +0 -0
  177. /data/{perl/DocFilter → svm-header-parse/HeaderParseService}/lib/CSXUtil/SafeText.pm +0 -0
  178. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/AssembleXMLMetadata.pm +0 -0
  179. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/LoadInformation.pm +0 -0
  180. /data/{perl → svm-header-parse}/HeaderParseService/lib/HeaderParse/API/NamePatternMatch.pm +0 -0
  181. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/50states +0 -0
  182. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AddrTopWords.txt +0 -0
  183. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWords.txt +0 -0
  184. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/AffiTopWordsAll.txt +0 -0
  185. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/ChineseSurNames.txt +0 -0
  186. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames.bin +0 -0
  187. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/Csurnames_spec.bin +0 -0
  188. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/DomainSuffixes.txt +0 -0
  189. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/LabeledHeader +0 -0
  190. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/README +0 -0
  191. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines +0 -0
  192. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/TrainMulClassLines1 +0 -0
  193. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstract.txt +0 -0
  194. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/abstractTopWords +0 -0
  195. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/addr.txt +0 -0
  196. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affi.txt +0 -0
  197. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/affis.bin +0 -0
  198. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/all_namewords_spec.bin +0 -0
  199. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/allnamewords.bin +0 -0
  200. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_US.txt +0 -0
  201. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cities_world.txt +0 -0
  202. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/city.txt +0 -0
  203. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/cityname.txt +0 -0
  204. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/country_abbr.txt +0 -0
  205. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/countryname.txt +0 -0
  206. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/dateTopWords +0 -0
  207. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/degree.txt +0 -0
  208. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/email.txt +0 -0
  209. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/excludeWords.txt +0 -0
  210. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/female-names +0 -0
  211. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstNames.txt +0 -0
  212. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames.bin +0 -0
  213. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/firstnames_spec.bin +0 -0
  214. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/intro.txt +0 -0
  215. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keyword.txt +0 -0
  216. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/keywordTopWords +0 -0
  217. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/male-names +0 -0
  218. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/middleNames.txt +0 -0
  219. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/month.txt +0 -0
  220. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul +0 -0
  221. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label +0 -0
  222. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.label.old +0 -0
  223. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mul.processed +0 -0
  224. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulAuthor +0 -0
  225. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/mulClassStat +0 -0
  226. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nickname.txt +0 -0
  227. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/nicknames.bin +0 -0
  228. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/note.txt +0 -0
  229. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/page.txt +0 -0
  230. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/phone.txt +0 -0
  231. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/postcode.txt +0 -0
  232. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/pubnum.txt +0 -0
  233. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.bin +0 -0
  234. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/statename.txt +0 -0
  235. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/states_and_abbreviations.txt +0 -0
  236. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords +0 -0
  237. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/stopwords.bin +0 -0
  238. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surNames.txt +0 -0
  239. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames.bin +0 -0
  240. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/surnames_spec.bin +0 -0
  241. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/A.html +0 -0
  242. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/B.html +0 -0
  243. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/C.html +0 -0
  244. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/D.html +0 -0
  245. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/E.html +0 -0
  246. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/F.html +0 -0
  247. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/G.html +0 -0
  248. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/H.html +0 -0
  249. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/I.html +0 -0
  250. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/J.html +0 -0
  251. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/K.html +0 -0
  252. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/L.html +0 -0
  253. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/M.html +0 -0
  254. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/N.html +0 -0
  255. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/O.html +0 -0
  256. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/P.html +0 -0
  257. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Q.html +0 -0
  258. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/R.html +0 -0
  259. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/S.html +0 -0
  260. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/T.html +0 -0
  261. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/U.html +0 -0
  262. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/V.html +0 -0
  263. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/W.html +0 -0
  264. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/WCSelect.gif +0 -0
  265. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/X.html +0 -0
  266. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Y.html +0 -0
  267. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/Z.html +0 -0
  268. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ae.html +0 -0
  269. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/am.html +0 -0
  270. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ar.html +0 -0
  271. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/at.html +0 -0
  272. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/au.html +0 -0
  273. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bd.html +0 -0
  274. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/be.html +0 -0
  275. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bg.html +0 -0
  276. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bh.html +0 -0
  277. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/blueribbon.gif +0 -0
  278. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bm.html +0 -0
  279. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/bn.html +0 -0
  280. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/br.html +0 -0
  281. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ca.html +0 -0
  282. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ch.html +0 -0
  283. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cl.html +0 -0
  284. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cn.html +0 -0
  285. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/co.html +0 -0
  286. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cr.html +0 -0
  287. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cy.html +0 -0
  288. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/cz.html +0 -0
  289. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/de.html +0 -0
  290. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dean-mainlink.jpg +0 -0
  291. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/dk.html +0 -0
  292. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ec.html +0 -0
  293. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ee.html +0 -0
  294. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/eg.html +0 -0
  295. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/es.html +0 -0
  296. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/et.html +0 -0
  297. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/faq.html +0 -0
  298. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fi.html +0 -0
  299. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fj.html +0 -0
  300. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fo.html +0 -0
  301. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/fr.html +0 -0
  302. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/geog.html +0 -0
  303. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gr.html +0 -0
  304. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/gu.html +0 -0
  305. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hk.html +0 -0
  306. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hr.html +0 -0
  307. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/hu.html +0 -0
  308. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/id.html +0 -0
  309. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ie.html +0 -0
  310. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/il.html +0 -0
  311. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/in.html +0 -0
  312. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/is.html +0 -0
  313. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/it.html +0 -0
  314. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jm.html +0 -0
  315. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jo.html +0 -0
  316. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/jp.html +0 -0
  317. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kaplan.gif +0 -0
  318. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kr.html +0 -0
  319. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/kw.html +0 -0
  320. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lb.html +0 -0
  321. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/linkbw2.gif +0 -0
  322. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lk.html +0 -0
  323. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lt.html +0 -0
  324. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lu.html +0 -0
  325. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/lv.html +0 -0
  326. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ma.html +0 -0
  327. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/maczynski.gif +0 -0
  328. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mirror.tar +0 -0
  329. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mk.html +0 -0
  330. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mo.html +0 -0
  331. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mseawdm.gif +0 -0
  332. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mt.html +0 -0
  333. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/mx.html +0 -0
  334. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/my.html +0 -0
  335. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ni.html +0 -0
  336. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nl.html +0 -0
  337. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/no.html +0 -0
  338. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/nz.html +0 -0
  339. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pa.html +0 -0
  340. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pe.html +0 -0
  341. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ph.html +0 -0
  342. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pl.html +0 -0
  343. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pointcom.gif +0 -0
  344. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pr.html +0 -0
  345. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ps.html +0 -0
  346. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/pt.html +0 -0
  347. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/recognition.html +0 -0
  348. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/results.html +0 -0
  349. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ro.html +0 -0
  350. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ru.html +0 -0
  351. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sd.html +0 -0
  352. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/se.html +0 -0
  353. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sg.html +0 -0
  354. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/si.html +0 -0
  355. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/sk.html +0 -0
  356. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/th.html +0 -0
  357. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tr.html +0 -0
  358. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/tw.html +0 -0
  359. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ua.html +0 -0
  360. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uk.html +0 -0
  361. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ-full.html +0 -0
  362. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/univ.html +0 -0
  363. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/uy.html +0 -0
  364. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/ve.html +0 -0
  365. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/yu.html +0 -0
  366. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/za.html +0 -0
  367. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list/zm.html +0 -0
  368. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/university_list.txt +0 -0
  369. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/url.txt +0 -0
  370. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/webTopWords +0 -0
  371. /data/{perl → svm-header-parse}/HeaderParseService/resources/database/words +0 -0
  372. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10ContextModelfold1 +0 -0
  373. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/10Modelfold1 +0 -0
  374. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11ContextModelfold1 +0 -0
  375. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/11Modelfold1 +0 -0
  376. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12ContextModelfold1 +0 -0
  377. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/12Modelfold1 +0 -0
  378. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13ContextModelfold1 +0 -0
  379. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/13Modelfold1 +0 -0
  380. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14ContextModelfold1 +0 -0
  381. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/14Modelfold1 +0 -0
  382. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15ContextModelfold1 +0 -0
  383. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/15Modelfold1 +0 -0
  384. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1ContextModelfold1 +0 -0
  385. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/1Modelfold1 +0 -0
  386. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2ContextModelfold1 +0 -0
  387. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/2Modelfold1 +0 -0
  388. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3ContextModelfold1 +0 -0
  389. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/3Modelfold1 +0 -0
  390. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4ContextModelfold1 +0 -0
  391. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/4Modelfold1 +0 -0
  392. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5ContextModelfold1 +0 -0
  393. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/5Modelfold1 +0 -0
  394. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6ContextModelfold1 +0 -0
  395. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/6Modelfold1 +0 -0
  396. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7ContextModelfold1 +0 -0
  397. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/7Modelfold1 +0 -0
  398. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8ContextModelfold1 +0 -0
  399. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/8Modelfold1 +0 -0
  400. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9ContextModelfold1 +0 -0
  401. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/9Modelfold1 +0 -0
  402. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceModel +0 -0
  403. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/NameSpaceTrainF +0 -0
  404. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperBaseFeaDict +0 -0
  405. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperContextFeaDict +0 -0
  406. /data/{perl → svm-header-parse}/HeaderParseService/resources/models/WrapperSpaceAuthorFeaDict +0 -0
@@ -0,0 +1,1949 @@
1
+ package SectLabel::AAMatching;
2
+
3
+ ###
4
+ # This package provides methods to solve the matching problem
5
+ # between author and affiliation in a pdf
6
+ #
7
+ # Do Hoang Nhat Huy 21 Apr, 11
8
+ ###
9
+
10
+ use strict;
11
+
12
+ # Dependencies
13
+ use POSIX;
14
+ use IO::File;
15
+ use XML::Writer;
16
+ use XML::Writer::String;
17
+
18
+ use Class::Struct;
19
+
20
+ # Local libraries
21
+ use SectLabel::Config;
22
+ use ParsCit::PostProcess;
23
+
24
+ # Dictionary
25
+ my %dict = ();
26
+ # CRF++
27
+ my $crft = $SectLabel::Config::crf_test;
28
+
29
+ # Matching features of each author, including
30
+ # Signals
31
+ # Coordinations: top, bottom, left, right
32
+ # Position: page, sections, paragraph, line
33
+ struct aut_rcfeatures =>
34
+ {
35
+ signals => '@',
36
+
37
+ top => '$',
38
+ bottom => '$',
39
+ left => '$',
40
+ right => '$',
41
+
42
+ page => '$',
43
+ section => '$',
44
+ para => '$',
45
+ line => '$'
46
+ };
47
+
48
+ # Matching features of each affiliation, including
49
+ # Signals
50
+ # Coordinations: top, bottom, left, right
51
+ # Position: page, sections, paragraph, line
52
+ struct aff_rcfeatures =>
53
+ {
54
+ signals => '@',
55
+
56
+ top => '$',
57
+ bottom => '$',
58
+ left => '$',
59
+ right => '$',
60
+
61
+ page => '$',
62
+ section => '$',
63
+ para => '$',
64
+ line => '$'
65
+ };
66
+
67
+ # Author
68
+ # Affiliation
69
+ sub AAMatching
70
+ {
71
+ my ($doc, $aut_addrs, $aff_addrs) = @_;
72
+
73
+ my $need_object = 1;
74
+ # Get the author objects
75
+ my $aut_lines = Omni::Traversal::OmniCollector($doc, $aut_addrs, $need_object);
76
+ # Get the affiliation objects
77
+ my $aff_lines = Omni::Traversal::OmniCollector($doc, $aff_addrs, $need_object);
78
+
79
+ # Dictionary
80
+ ReadDict($FindBin::Bin . "/../" . $SectLabel::Config::dictFile);
81
+
82
+ # Authors
83
+ my ($aut_features, $aut_rc_features) = AuthorFeatureExtraction($aut_lines, $aut_addrs);
84
+ # Call CRF
85
+ my ($aut_signal, $aut_rc) = AuthorExtraction($aut_features, $aut_rc_features);
86
+
87
+ # Affiliations
88
+ my ($aff_features, $aff_rc_features) = AffiliationFeatureExtraction($aff_lines, $aff_addrs);
89
+ # Call CRF
90
+ my ($aff_signal, $aff_rc, $affs) = AffiliationExtraction($aff_features, $aff_rc_features);
91
+
92
+ # Matching features
93
+ my $aa_features = AAFeatureExtraction($aut_rc, $aff_rc);
94
+ # Matching
95
+ my $aa = AAMatchingImp($aa_features);
96
+
97
+ =pod
98
+ # DEBUG
99
+ my $aut_handle = undef;
100
+ my $aff_handle = undef;
101
+ my $aau_handle = undef;
102
+ my $aaf_handle = undef;
103
+ my $aut_debug = undef;
104
+ my $aff_debug = undef;
105
+ my $aa_handle = undef;
106
+
107
+ open $aut_handle, ">:utf8", "aut.features";
108
+ open $aff_handle, ">:utf8", "aff.features";
109
+ open $aau_handle, ">:utf8", "aau.features";
110
+ open $aaf_handle, ">:utf8", "aaf.features";
111
+ open $aut_debug, ">:utf8", "aut.debug.features";
112
+ open $aff_debug, ">:utf8", "aff.debug.features";
113
+ open $aa_handle, ">:utf8", "aa.features";
114
+
115
+ print $aut_handle $aut_features;
116
+ print $aff_handle $aff_features;
117
+ print $aau_handle $aut_rc_features;
118
+ print $aaf_handle $aff_rc_features;
119
+ print $aa_handle $aa_features, "\n";
120
+
121
+ foreach my $author (keys %{ $aut_rc } )
122
+ {
123
+ print $aut_debug $author, ": ", "\n";
124
+
125
+ foreach my $signal (@{ $aut_rc->{ $author }->signals })
126
+ {
127
+ print $aut_debug "\t", $signal, "\n";
128
+ }
129
+
130
+ print $aut_debug "\t", $aut_rc->{ $author }->top, "\n";
131
+ print $aut_debug "\t", $aut_rc->{ $author }->bottom, "\n";
132
+ print $aut_debug "\t", $aut_rc->{ $author }->left, "\n";
133
+ print $aut_debug "\t", $aut_rc->{ $author }->right, "\n";
134
+
135
+ print $aut_debug "\t", $aut_rc->{ $author }->page, "\n";
136
+ print $aut_debug "\t", $aut_rc->{ $author }->section, "\n";
137
+ print $aut_debug "\t", $aut_rc->{ $author }->para, "\n";
138
+ print $aut_debug "\t", $aut_rc->{ $author }->line, "\n";
139
+ }
140
+
141
+ foreach my $affiliation (keys %{ $aff_rc } )
142
+ {
143
+ print $aff_debug $affiliation, ": ", "\n";
144
+
145
+ foreach my $signal (@{ $aff_rc->{ $affiliation }->signals })
146
+ {
147
+ print $aff_debug "\t", $signal, "\n";
148
+ }
149
+
150
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->top, "\n";
151
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->bottom, "\n";
152
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->left, "\n";
153
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->right, "\n";
154
+
155
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->page, "\n";
156
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->section, "\n";
157
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->para, "\n";
158
+ print $aff_debug "\t", $aff_rc->{ $affiliation }->line, "\n";
159
+ }
160
+
161
+ close $aut_handle;
162
+ close $aff_handle;
163
+ close $aau_handle;
164
+ close $aaf_handle;
165
+ close $aut_debug;
166
+ close $aff_debug;
167
+ close $aa_handle;
168
+ # END
169
+ =cut
170
+
171
+ # Do the matching
172
+ # XML string
173
+ my $sxml = "";
174
+ # and XML writer
175
+ my $writer = new XML::Writer(OUTPUT => \$sxml, ENCODING => 'utf-8', DATA_MODE => 'true', DATA_INDENT => 2);
176
+
177
+ # Algorithm
178
+ $writer->startTag("algorithm", "name" => "AAMatching", "version" => $SectLabel::Config::algorithmVersion);
179
+
180
+ # XML header
181
+ my $date = `date`; chomp($date);
182
+ my $time = `date +%s`; chomp($time);
183
+ # Write XML header
184
+ $writer->startTag("results", "time" => $time, "date" => $date);
185
+
186
+ # Write authors
187
+ $writer->startTag("authors");
188
+
189
+ # Write the author name and his corresponding institution
190
+ foreach my $author (keys %{ $aut_signal })
191
+ {
192
+ $writer->startTag("author");
193
+
194
+ $writer->startTag("fullname", "source" => "parscit");
195
+ $writer->characters($author);
196
+ $writer->endTag("fullname");
197
+
198
+ $writer->startTag("institutions");
199
+ =pod
200
+ foreach my $signal (@{ $aut_signal->{ $author } })
201
+ {
202
+ $signal =~ s/^\s+|\s+$//g;
203
+ # Skip blank
204
+ if ($signal eq "") { next; }
205
+
206
+ $writer->startTag("institution", "symbol" => $signal);
207
+ $writer->characters($aff_signal->{ $signal });
208
+ $writer->endTag("institution");
209
+ }
210
+ =cut
211
+
212
+ foreach my $affiliation (@{ $aa->{ $author } })
213
+ {
214
+ $writer->startTag("institution");
215
+ $writer->characters($affiliation);
216
+ $writer->endTag("institution");
217
+ }
218
+
219
+ $writer->endTag("institutions");
220
+
221
+ $writer->endTag("author");
222
+ }
223
+
224
+ # Finish authors
225
+ $writer->endTag("authors");
226
+
227
+ # Write institutions
228
+ $writer->startTag("institutions");
229
+
230
+ # Write the instituion name
231
+ foreach my $institute (@{ $affs })
232
+ {
233
+ $writer->startTag("institution");
234
+ $writer->characters($institute);
235
+ $writer->endTag("institution");
236
+ }
237
+
238
+ $writer->endTag("institutions");
239
+
240
+ # Done
241
+ $writer->endTag("results");
242
+ # Done
243
+ $writer->endTag("algorithm");
244
+ # Done
245
+ $writer->end();
246
+
247
+ # Return the xml content back to the caller
248
+ return $sxml;
249
+ }
250
+
251
+ # Features of the relational classifier between author and affiliation
252
+ sub AAFeatureExtraction
253
+ {
254
+ my ($aut_rc, $aff_rc) = @_;
255
+
256
+ # Relational features
257
+ my $features = "";
258
+
259
+ # Features between x authors
260
+ foreach my $author (keys %{ $aut_rc })
261
+ {
262
+ my @aut_tokens = split /\s/, $author;
263
+ my $author_nb = join '|||', @aut_tokens;
264
+
265
+ my $min_aff_x = undef;
266
+ my $min_dist_x = LONG_MAX;
267
+ my $min_aff_y = undef;
268
+ my $min_dist_y = LONG_MAX;
269
+ # Find the nearest affiliation
270
+ foreach my $aff (keys %{ $aff_rc })
271
+ {
272
+ my $aut_x = ($aut_rc->{ $author }->left + $aut_rc->{ $author }->right) / 2;
273
+ my $aut_y = ($aut_rc->{ $author }->top + $aut_rc->{ $author }->bottom) / 2;
274
+
275
+ my $aff_x = ($aff_rc->{ $aff }->left + $aff_rc->{ $aff }->right) / 2;
276
+ my $aff_y = ($aff_rc->{ $aff }->top + $aff_rc->{ $aff }->bottom) / 2;
277
+
278
+ my $dis_x = abs( $aut_x - $aff_x );
279
+ my $dis_y = abs( $aut_y - $aff_y );
280
+ # Distance between an author and an affiliation
281
+ # my $distance = sqrt( $dis_x * $dis_x + $dis_y * $dis_y );
282
+
283
+ # Check if it it the minimum distance in x axis
284
+ if ($dis_x < $min_dist_x)
285
+ {
286
+ $min_dist_x = $dis_x;
287
+ $min_aff_x = $aff;
288
+ }
289
+
290
+ # Check if it it the minimum distance in y axis
291
+ if ($dis_y < $min_dist_y)
292
+ {
293
+ $min_dist_y = $dis_y;
294
+ $min_aff_y = $aff;
295
+ }
296
+ }
297
+
298
+ # and y affiliation
299
+ foreach my $aff (keys %{ $aff_rc })
300
+ {
301
+ my @aff_tokens = split /\s/, $aff;
302
+ my $aff_nb = join '|||', @aff_tokens;
303
+
304
+ # Content
305
+ $features .= $author_nb . "#" . $aff_nb . "\t";
306
+
307
+ my $signal = undef;
308
+ # Signal
309
+ if ((scalar(@{ $aut_rc->{ $author }->signals }) == 0) || (scalar(@{ $aff_rc->{ $aff }->signals }) == 0))
310
+ {
311
+ $signal = "diff";
312
+ }
313
+ else
314
+ {
315
+ my $matched = undef;
316
+ # Check each author signal
317
+ foreach my $aut_sig (@{ $aut_rc->{ $author }->signals })
318
+ {
319
+ # if it match with affiliation signal
320
+ if ($aut_sig eq ${ $aff_rc->{ $aff }->signals }[ 0 ]) { $matched = 1; last; }
321
+ }
322
+
323
+ $signal = (! defined $matched) ? "diff" : "same";
324
+ }
325
+
326
+ # Signal
327
+ $features .= $signal . "\t";
328
+
329
+ # Same page
330
+ my $page = ($aut_rc->{ $author }->page == $aff_rc->{ $aff }->page) ? "yes" : "no";
331
+ $features .= $page . "\t";
332
+
333
+ my $section = undef;
334
+ # Same section
335
+ if ($page eq "yes")
336
+ {
337
+ $section = ($aut_rc->{ $author }->section == $aff_rc->{ $aff }->section) ? "yes" : "no";
338
+ $features .= $section . "\t";
339
+ }
340
+ else
341
+ {
342
+ $section = "no";
343
+ $features .= $section . "\t";
344
+ }
345
+
346
+ my $para = undef;
347
+ # Same paragraph
348
+ if (($page eq "yes") && ($section eq "yes"))
349
+ {
350
+ $para = ($aut_rc->{ $author }->para == $aff_rc->{ $aff }->para) ? "yes" : "no";
351
+ $features .= $para . "\t";
352
+ }
353
+ else
354
+ {
355
+ $para = "no";
356
+ $features .= $para . "\t";
357
+ }
358
+
359
+ my $line = undef;
360
+ # Same line
361
+ if (($page eq "yes") && ($section eq "yes") && ($para eq "yes"))
362
+ {
363
+ $line = ($aut_rc->{ $author }->line == $aff_rc->{ $aff }->line) ? "yes" : "no";
364
+ $features .= $line . "\t";
365
+ }
366
+ else
367
+ {
368
+ $line = "no";
369
+ $features .= $line . "\t";
370
+ }
371
+
372
+ # Is neartest affiliation in x axis ?
373
+ my $nearest_x = ($aff eq $min_aff_x) ? "yes" : "no";
374
+ $features .= $nearest_x . "\t";
375
+
376
+ # Is neartest affiliation in y axis ?
377
+ my $nearest_y = ($aff eq $min_aff_y) ? "yes" : "no";
378
+ $features .= $nearest_y . "\n";
379
+ }
380
+ }
381
+
382
+ return $features;
383
+ }
384
+
385
+ # Actually do the matching between author and affiliation
386
+ sub AAMatchingImp
387
+ {
388
+ my ($features) = @_;
389
+
390
+ # Temporary input file for CRF
391
+ my $infile = BuildTmpFile("aa-input");
392
+ # Temporary output file for CRF
393
+ my $outfile = BuildTmpFile("aa-output");
394
+
395
+ my $output_handle = undef;
396
+ # Split and write to temporary input
397
+ open $output_handle, ">:utf8", $infile;
398
+ # Split
399
+ my @lines = split /\n/, $features;
400
+ # and write
401
+ foreach my $line (@lines)
402
+ {
403
+ if ($line eq "")
404
+ {
405
+ print $output_handle "\n";
406
+ }
407
+ else
408
+ {
409
+ print $output_handle $line, "\t", "no", "\n";
410
+ }
411
+ }
412
+ # Done
413
+ close $output_handle;
414
+
415
+ # AA matching model
416
+ my $match_model = $SectLabel::Config::matFile;
417
+
418
+ # Matching
419
+ system("$crft -m $match_model $infile > $outfile");
420
+
421
+ # List of authors and their affiliation (if exists)
422
+ my %aa = ();
423
+
424
+ my $input_handle = undef;
425
+ # Read the CRF output
426
+ open $input_handle, "<:utf8", $outfile;
427
+ # Read each line and get its label
428
+ while (<$input_handle>)
429
+ {
430
+ my $line = $_;
431
+ # Trim
432
+ $line =~ s/^\s+|\s+$//g;
433
+ # Blank linem, what the heck ?
434
+ if ($line eq "") { next; }
435
+
436
+ # Split the line
437
+ my @fields = split /\t/, $line;
438
+ # and extract the class and the content
439
+ my $class = $fields[ -1 ];
440
+ my $content = $fields[ 0 ];
441
+
442
+ # You miss
443
+ if ($class ne "yes") { next; }
444
+
445
+ # Split the content into author name and affiliation name
446
+ my @tmp = split /#/, $content;
447
+ # Author name
448
+ my $author = $tmp[ 0 ];
449
+ $author =~ s/\|\|\|/ /g;
450
+ # Affiliation name
451
+ my $aff = $tmp[ 1 ];
452
+ $aff =~ s/\|\|\|/ /g;
453
+
454
+ # Save
455
+ if (! exists $aa{ $author }) { $aa{ $author } = (); }
456
+ # Save
457
+ push @{ $aa{ $author } }, $aff;
458
+ }
459
+
460
+ # Done
461
+ close $input_handle;
462
+
463
+ # Clean up
464
+ unlink $infile;
465
+ unlink $outfile;
466
+ # Done
467
+ return (\%aa);
468
+ }
469
+
470
+ # Extract affiliation and their signal using crf
471
+ sub AffiliationExtraction
472
+ {
473
+ my ($features, $rc_features) = @_;
474
+
475
+ # Temporary input file for CRF
476
+ my $infile = BuildTmpFile("aff-input");
477
+ # Temporary output file for CRF
478
+ my $outfile = BuildTmpFile("aff-output");
479
+
480
+ my $output_handle = undef;
481
+ # Split and write to temporary input
482
+ open $output_handle, ">:utf8", $infile;
483
+ # Split
484
+ my @lines = split /\n/, $features;
485
+ # and write
486
+ foreach my $line (@lines)
487
+ {
488
+ if ($line eq "")
489
+ {
490
+ print $output_handle "\n";
491
+ }
492
+ else
493
+ {
494
+ print $output_handle $line, "\t", "affiliation", "\n";
495
+ }
496
+ }
497
+ # Done
498
+ close $output_handle;
499
+
500
+ # Author model
501
+ my $aff_model = $SectLabel::Config::affFile;
502
+
503
+ # Split the authors
504
+ system("$crft -m $aff_model $infile > $outfile");
505
+
506
+ # Each affiliation can have only one signal
507
+ my %asg = ();
508
+ # Each affilitiaon can have only one struct
509
+ my %aaf = ();
510
+ # List of all affiliations
511
+ my @aff = ();
512
+
513
+ # Each line in the relational features string
514
+ my @rc_lines = split /\n/, $rc_features;
515
+
516
+ my $input_handle = undef;
517
+ # Read the CRF output
518
+ open $input_handle, "<:utf8", $outfile;
519
+ # Author and signal string
520
+ my $prev_class = "";
521
+ my @aff_str = ();
522
+ my $signal_str = "";
523
+ # Relational classifier
524
+ my @aaf_rc = ();
525
+ # Line counter
526
+ my $counter = 0;
527
+ # Next to last signal
528
+ my $ntl_signal = "";
529
+ # Read each line and get its label
530
+ # TODO: The code assumes that an affiliation will have the following format: 1 foobar institute
531
+ while (<$input_handle>)
532
+ {
533
+ my $line = $_;
534
+ # Trim
535
+ $line =~ s/^\s+|\s+$//g;
536
+ # Blank line mark the end of an affiliation section
537
+ if ($line eq "")
538
+ {
539
+ if ($prev_class eq "affiliation")
540
+ {
541
+ my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
542
+ # Save the affiliation
543
+ push @aff, $affiliation;
544
+ # and its signal
545
+ if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
546
+
547
+ # Save the signal
548
+ push @{ $rcs->signals }, $ntl_signal;
549
+ # Save the record
550
+ $aaf{ $affiliation } = $rcs;
551
+ }
552
+ elsif ($prev_class eq "signal")
553
+ {
554
+ # Save the next to last signal
555
+ $ntl_signal = NormalizeAffiliationSignal($signal_str);
556
+ }
557
+
558
+ # Cleanup
559
+ $ntl_signal = "";
560
+ # Cleanup
561
+ @aff_str = ();
562
+ $signal_str = "";
563
+ $prev_class = "";
564
+ # Cleanup
565
+ @aaf_rc = ();
566
+
567
+ # Update the counter
568
+ $counter++;
569
+
570
+ next ;
571
+ }
572
+
573
+ # Split the line
574
+ my @fields = split /\t/, $line;
575
+ # and extract the class and the content
576
+ my $class = $fields[ -1 ];
577
+ my $content = $fields[ 0 ];
578
+
579
+ if ($class eq $prev_class)
580
+ {
581
+ # An affiliation
582
+ if ($class eq "affiliation")
583
+ {
584
+ push @aff_str, $content;
585
+ push @aaf_rc, $rc_lines[ $counter ];
586
+ }
587
+ # A signal
588
+ elsif ($class eq "signal")
589
+ {
590
+ $signal_str .= $content . " ";
591
+ }
592
+ }
593
+ else
594
+ {
595
+ if ($prev_class eq "affiliation")
596
+ {
597
+ my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
598
+ # Save the affiliation
599
+ push @aff, $affiliation;
600
+ # and its signal
601
+ if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
602
+
603
+ # Save the signal
604
+ push @{ $rcs->signals }, $ntl_signal;
605
+ # Save the record
606
+ $aaf{ $affiliation } = $rcs;
607
+
608
+ }
609
+ elsif ($prev_class eq "signal")
610
+ {
611
+ # Save the next to last signal
612
+ $ntl_signal = NormalizeAffiliationSignal($signal_str);
613
+ }
614
+
615
+ # Cleanup
616
+ @aff_str = ();
617
+ $signal_str = "";
618
+ @aaf_rc = ();
619
+ # Switch to the current class
620
+ $prev_class = $class;
621
+
622
+ if ($class eq "affiliation")
623
+ {
624
+ push @aff_str, $content;
625
+ push @aaf_rc, $rc_lines[ $counter ];
626
+ }
627
+ elsif ($class eq "signal")
628
+ {
629
+ $signal_str .= $content . " ";
630
+ }
631
+ }
632
+
633
+ # Update the counter
634
+ $counter++;
635
+ }
636
+
637
+ # Final class
638
+ if ($prev_class eq "affiliation")
639
+ {
640
+ my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
641
+ # Save the affiliation
642
+ push @aff, $affiliation;
643
+ # and its signal
644
+ if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
645
+
646
+ # Save the signal
647
+ push @{ $rcs->signals }, $ntl_signal;
648
+ # Save the record
649
+ $aaf{ $affiliation } = $rcs;
650
+ }
651
+ elsif ($prev_class eq "signal")
652
+ {
653
+ # Save the next to last signal
654
+ $ntl_signal = NormalizeAffiliationSignal($signal_str);
655
+ }
656
+
657
+ # Done
658
+ close $input_handle;
659
+
660
+ # Clean up
661
+ unlink $infile;
662
+ unlink $outfile;
663
+ # Done
664
+ return (\%asg, \%aaf, \@aff);
665
+ }
666
+
667
+ sub NormalizeAffiliationSignal
668
+ {
669
+ my ($signal_str) = @_;
670
+
671
+ # Trim
672
+ $signal_str =~ s/^\s+|\s+$//g;
673
+ # Remove all space inside the signature
674
+ $signal_str =~ s/\s+//g;
675
+
676
+ # Done
677
+ return $signal_str;
678
+ }
679
+
680
+ sub NormalizeAffiliationName
681
+ {
682
+ my ($aff_str, $aaf_rc) = @_;
683
+
684
+ # Constraint
685
+ if (scalar(@{ $aff_str }) != scalar(@{ $aaf_rc })) { print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n"; die; }
686
+
687
+ # Affiliation string
688
+ my $affiliation = join ' ', @{ $aff_str };
689
+
690
+ # First word
691
+ my @fields = split /\s/, $aaf_rc->[ 0 ];
692
+ # Save the relational features of an affiliation (its first word)
693
+ my $rcs = aff_rcfeatures->new( signals => [],
694
+ top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
695
+ page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
696
+ # Done
697
+ return ($affiliation, $rcs);
698
+ }
699
+
700
+ # Extract author name and their signal using crf
701
+ sub AuthorExtraction
702
+ {
703
+ my ($features, $rc_features) = @_;
704
+
705
+ # Temporary input file for CRF
706
+ my $infile = BuildTmpFile("aut-input");
707
+ # Temporary output file for CRF
708
+ my $outfile = BuildTmpFile("aut-output");
709
+
710
+ my $output_handle = undef;
711
+ # Split and write to temporary input
712
+ open $output_handle, ">:utf8", $infile;
713
+ # Split
714
+ my @lines = split /\n/, $features;
715
+ # and write
716
+ foreach my $line (@lines)
717
+ {
718
+ if ($line eq "")
719
+ {
720
+ print $output_handle "\n";
721
+ }
722
+ else
723
+ {
724
+ print $output_handle $line, "\t", "ns", "\n";
725
+ }
726
+ }
727
+ # Done
728
+ close $output_handle;
729
+
730
+ # Author model
731
+ my $author_model = $SectLabel::Config::autFile;
732
+
733
+ # Split the authors
734
+ system("$crft -m $author_model $infile > $outfile");
735
+
736
+ # Each author can have one or more signals
737
+ my %asg = ();
738
+ # Each author can have only one struct
739
+ my %aas = ();
740
+
741
+ # Each line in the relational features string
742
+ my @rc_lines = split /\n/, $rc_features;
743
+
744
+ my $input_handle = undef;
745
+ # Read the CRF output
746
+ open $input_handle, "<:utf8", $outfile;
747
+ # Author and signal string
748
+ my $prev_class = "";
749
+ my @author_str = ();
750
+ my $signal_str = "";
751
+ # Relational classifier
752
+ my @author_rc = ();
753
+ # Line counter
754
+ my $counter = 0;
755
+ # Next to last authors
756
+ my %ntl_asg = ();
757
+ #
758
+ my $is_authors = 0;
759
+ # Read each line and get its label
760
+ while (<$input_handle>)
761
+ {
762
+ my $line = $_;
763
+ # Trim
764
+ $line =~ s/^\s+|\s+$//g;
765
+ # Blank line mark the end of an author section
766
+ if ($line eq "")
767
+ {
768
+ if ($prev_class eq "author")
769
+ {
770
+ my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
771
+ # Save each author
772
+ for (my $i = 0; $i < scalar(@{ $authors }); $i++)
773
+ {
774
+ $asg{ $authors->[ $i ] } = ();
775
+ $aas{ $authors->[ $i ] } = $rcs->[ $i ];
776
+ $ntl_asg{ $authors->[ $i ] } = 0;
777
+ }
778
+ }
779
+ elsif ($prev_class eq "signal")
780
+ {
781
+ my $signals = NormalizeAuthorSignal($signal_str);
782
+ # Save each signal to its corresponding author
783
+ foreach my $author (keys %ntl_asg)
784
+ {
785
+ foreach my $signal (@{ $signals })
786
+ {
787
+ push @{ $asg{ $author } }, $signal;
788
+ push @{ $aas{ $author }->signals }, $signal;
789
+ }
790
+ }
791
+ }
792
+
793
+ # Cleanup
794
+ %ntl_asg = ();
795
+ # Cleanup
796
+ @author_str = ();
797
+ $signal_str = "";
798
+ @author_rc = ();
799
+ # Cleanup
800
+ $prev_class = "";
801
+
802
+ # Update the counter
803
+ $counter++;
804
+
805
+ #
806
+ $is_authors = 0;
807
+
808
+ next;
809
+ }
810
+
811
+ # Split the line
812
+ my @fields = split /\t/, $line;
813
+ # and extract the class and the content
814
+ my $class = $fields[ -1 ];
815
+ my $content = $fields[ 0 ];
816
+
817
+ if ($class eq $prev_class)
818
+ {
819
+ # An author
820
+ if ($class eq "author")
821
+ {
822
+ push @author_str, $content;
823
+ push @author_rc, $rc_lines[ $counter ];
824
+ }
825
+ # A signal
826
+ elsif ($class eq "signal")
827
+ {
828
+ $signal_str .= $content . " ";
829
+ }
830
+ }
831
+ else
832
+ {
833
+ if ($prev_class eq "author")
834
+ {
835
+ my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
836
+ # Save each author
837
+ for (my $i = 0; $i < scalar(@{ $authors }); $i++)
838
+ {
839
+ $asg{ $authors->[ $i ] } = ();
840
+ $aas{ $authors->[ $i ] } = $rcs->[ $i ];
841
+ $ntl_asg{ $authors->[ $i ] } = 0;
842
+ }
843
+ }
844
+ elsif ($prev_class eq "signal")
845
+ {
846
+ my $signals = NormalizeAuthorSignal($signal_str);
847
+ # Save each signal to its corresponding author
848
+ foreach my $author (keys %ntl_asg)
849
+ {
850
+ foreach my $signal (@{ $signals })
851
+ {
852
+ push @{ $asg{ $author } }, $signal;
853
+ push @{ $aas{ $author }->signals }, $signal;
854
+ }
855
+ }
856
+ }
857
+
858
+ # Clean the next to last author list if this current class is author
859
+ if (($is_authors == 0) && ($class eq "author")) { %ntl_asg = (); $is_authors = 1; }
860
+ #
861
+ if ($class eq "signal") { $is_authors = 0; }
862
+
863
+ # Cleanup
864
+ @author_str = ();
865
+ $signal_str = "";
866
+ @author_rc = ();
867
+ # Switch to the current class
868
+ $prev_class = $class;
869
+
870
+ if ($class eq "author")
871
+ {
872
+ push @author_str, $content;
873
+ push @author_rc, $rc_lines[ $counter ];
874
+ }
875
+ elsif ($class eq "signal")
876
+ {
877
+ $signal_str .= $content . " ";
878
+ }
879
+ }
880
+
881
+ # Update the counter
882
+ $counter++;
883
+ }
884
+
885
+ # Final class
886
+ if ($prev_class eq "author")
887
+ {
888
+ my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
889
+ # Save each author
890
+ for (my $i = 0; $i < scalar(@{ $authors }); $i++)
891
+ {
892
+ $asg{ $authors->[ $i ] } = ();
893
+ $aas{ $authors->[ $i ] } = $rcs->[ $i ];
894
+ $ntl_asg{ $authors->[ $i ] } = 0;
895
+ }
896
+ }
897
+ elsif ($prev_class eq "signal")
898
+ {
899
+ my $signals = NormalizeAuthorSignal($signal_str);
900
+ # Save each signal to its corresponding author
901
+ foreach my $author (keys %ntl_asg)
902
+ {
903
+ foreach my $signal (@{ $signals })
904
+ {
905
+ push @{ $asg{ $author } }, $signal;
906
+ push @{ $aas{ $author }->signals }, $signal;
907
+ }
908
+ }
909
+ }
910
+
911
+ # Done
912
+ close $input_handle;
913
+
914
+ # Clean up
915
+ unlink $infile;
916
+ unlink $outfile;
917
+ # Done
918
+ return (\%asg, \%aas);
919
+ }
920
+
921
+ sub NormalizeAuthorNames
922
+ {
923
+ my ($author_str, $author_rc) = @_;
924
+
925
+ # Constraint
926
+ if (scalar(@{ $author_str }) != scalar(@{ $author_rc })) { print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n"; die; }
927
+
928
+ # Mark the beginning of an author name
929
+ my $begin = 1;
930
+ # and its corresponding relational features
931
+ my $rcbegin = 0;
932
+
933
+ my @current = ();
934
+ my @authors = ();
935
+ my @rcs = ();
936
+ # Check all tokens in the author string
937
+ for (my $i = 0; $i < scalar(@{ $author_str }); $i++)
938
+ {
939
+ my $token = $author_str->[ $i ];
940
+
941
+ # Mark the end of an author name
942
+ if ($token =~ m/^(&|and|,|;)$/i)
943
+ {
944
+ if (scalar(@current) != 0)
945
+ {
946
+ push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
947
+
948
+ # Save the relational features of an author (its first word)
949
+ my @fields = split /\s/, $author_rc->[ $rcbegin ];
950
+ # Create new record
951
+ my $tmp = aut_rcfeatures->new( signals => [],
952
+ top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
953
+ page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
954
+ # Save the record
955
+ push @rcs, $tmp;
956
+ }
957
+
958
+ # Cleanup
959
+ @current = ();
960
+ $begin = 1;
961
+
962
+ next;
963
+ }
964
+
965
+ # Mark the begin of an author name
966
+ if ($begin == 1)
967
+ {
968
+ push @current, $token;
969
+
970
+ $begin = 0;
971
+ $rcbegin = $i;
972
+
973
+ next;
974
+ }
975
+
976
+ # Author name ending with a comma
977
+ if ($token =~ m/,$/)
978
+ {
979
+ push @current, $token;
980
+
981
+ if (scalar(@current) != 0)
982
+ {
983
+ push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
984
+
985
+ # Save the relational features of an author (its first word)
986
+ my @fields = split /\s/, $author_rc->[ $rcbegin ];
987
+ # Create new record
988
+ my $tmp = aut_rcfeatures->new( signals => [],
989
+ top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
990
+ page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
991
+ # Save the record
992
+ push @rcs, $tmp;
993
+ }
994
+
995
+ # Cleanup
996
+ @current = ();
997
+ $begin = 1;
998
+ }
999
+ # or it's just parts of the name
1000
+ else
1001
+ {
1002
+ push @current, $token;
1003
+ }
1004
+ }
1005
+
1006
+ # Last author name
1007
+ if (scalar(@current) != 0)
1008
+ {
1009
+ push @authors, ParsCit::PostProcess::NormalizeAuthorName(@current);
1010
+
1011
+ # Save the relational features of an author (its first word)
1012
+ my @fields = split /\s/, $author_rc->[ $rcbegin ];
1013
+ # Create new record
1014
+ my $tmp = aut_rcfeatures->new( signals => [],
1015
+ top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
1016
+ page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ] );
1017
+ # Save the record
1018
+ push @rcs, $tmp;
1019
+ }
1020
+
1021
+ # Done
1022
+ return (\@authors, \@rcs);
1023
+ }
1024
+
1025
+ #
1026
+ sub NormalizeAuthorSignal
1027
+ {
1028
+ my ($signal_str) = @_;
1029
+
1030
+ # Trim
1031
+ $signal_str =~ s/^\s+|\s+$//g;
1032
+ # Split into individual signal
1033
+ my @signals = split / |,|:|;/, $signal_str;
1034
+
1035
+ # Done
1036
+ return \@signals;
1037
+ }
1038
+
1039
+ # Extract features from affiliation lines
1040
+ # The list of features include
1041
+ # Content
1042
+ # Content, lower case, no punctuation
1043
+ # Content length
1044
+ # First word in line
1045
+ #
1046
+ # XML features
1047
+ # Subscript, superscript
1048
+ # Bold
1049
+ # Italic
1050
+ # Underline
1051
+ # Relative font size
1052
+ # Differentiate features
1053
+ sub AffiliationFeatureExtraction
1054
+ {
1055
+ my ($aff_lines, $aff_addrs) = @_;
1056
+
1057
+ # NOTE: Relational classifier features
1058
+ my $rc_features = "";
1059
+
1060
+ # Features will be stored here
1061
+ my $features = "";
1062
+ # First word in line
1063
+ my $is_first_line = undef;
1064
+
1065
+ # Font size
1066
+ my %fonts = ();
1067
+ # Each line contains many runs
1068
+ foreach my $line (@{ $aff_lines })
1069
+ {
1070
+ my $runs = $line->get_objs_ref();
1071
+ # Iterator though all work in all lines
1072
+ foreach my $run (@{ $runs })
1073
+ {
1074
+ my $fsize = $run->get_font_size();
1075
+ my $words = $run->get_objs_ref();
1076
+
1077
+ # Statistic
1078
+ if (! exists $fonts{ $fsize })
1079
+ {
1080
+ $fonts{ $fsize } = scalar(@{ $words });
1081
+ }
1082
+ else
1083
+ {
1084
+ $fonts{ $fsize } += scalar(@{ $words });
1085
+ }
1086
+ }
1087
+ }
1088
+
1089
+ my $dominate_font = undef;
1090
+ # Sort all the font descend with the number of their appearance
1091
+ my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
1092
+ # Select the dominated font
1093
+ $dominate_font = $sorted[ 0 ];
1094
+
1095
+ my $size_mismatch = undef;
1096
+ # TODO: serious error if the size of aff_lines and the size of aff_addrs mismatch
1097
+ if (scalar(@{ $aff_lines }) != scalar(@{ $aff_addrs }))
1098
+ {
1099
+ $size_mismatch = 1;
1100
+ # Print the error but still try to continue
1101
+ print STDERR "# Total number of affiliation lines (" . scalar(@{ $aff_lines }) . ") != Total number of affiliation addresses (" . scalar(@{ $aff_addrs }) . ")." . "\n";
1102
+ }
1103
+
1104
+ my $prev_page = undef;
1105
+ my $prev_sect = undef;
1106
+ my $prev_para = undef;
1107
+ # Each line contains many runs
1108
+ for (my $counter = 0; $counter < scalar(@{ $aff_lines }); $counter++)
1109
+ {
1110
+ # Get the line object
1111
+ my $line = $aff_lines->[ $counter ];
1112
+
1113
+ # Check the size of aff_lines and aff_addrs
1114
+ if (! defined $size_mismatch)
1115
+ {
1116
+ # Check if two consecutive lines are from two different sections
1117
+ if (! defined $prev_page)
1118
+ {
1119
+ # Init
1120
+ $prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
1121
+ $prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
1122
+ $prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
1123
+ }
1124
+ else
1125
+ {
1126
+ # Affiliations from different sections will be separated immediately
1127
+ if (($prev_page != $aff_addrs->[ $counter ]->{ 'L1' }) ||
1128
+ ($prev_sect != $aff_addrs->[ $counter ]->{ 'L2' }) ||
1129
+ ($prev_para != $aff_addrs->[ $counter ]->{ 'L3' }))
1130
+ {
1131
+ $features .= "\n";
1132
+
1133
+ # NOTE: Relational classifier features
1134
+ $rc_features .= "\n";
1135
+ }
1136
+
1137
+ # Save the paragraph index
1138
+ $prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
1139
+ $prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
1140
+ $prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
1141
+ }
1142
+ }
1143
+
1144
+ # Set first word in line
1145
+ $is_first_line = 1;
1146
+
1147
+ # Two previous words
1148
+ my $prev_word = undef;
1149
+ my $prev_prev_word = undef;
1150
+
1151
+ # Format of the previous word
1152
+ my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
1153
+
1154
+ my $runs = $line->get_objs_ref();
1155
+ # Iterator though all work in all lines
1156
+ foreach my $run (@{ $runs })
1157
+ {
1158
+ # The run must be non-empty
1159
+ my $tmp = $run->get_content();
1160
+ # Trim
1161
+ $tmp =~ s/^\s+|\s+$//g;
1162
+ # Skip blank run
1163
+ if ($tmp eq "") { next; }
1164
+
1165
+ ###
1166
+ # The following features are XML features
1167
+ ###
1168
+
1169
+ # Bold format
1170
+ my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
1171
+
1172
+ # Italic format
1173
+ my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
1174
+
1175
+ # Underline
1176
+ my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
1177
+
1178
+ # Sub-Sup-script
1179
+ my $suscript = ($run->get_suscript() eq "superscript") ? "super" :
1180
+ ($run->get_suscript() eq "subscript") ? "sub" : "none";
1181
+
1182
+ # Relative font size
1183
+ my $fontsize = ($run->get_font_size() > $dominate_font) ? "large" :
1184
+ ($run->get_font_size() < $dominate_font) ? "small" : "normal";
1185
+
1186
+ ###
1187
+ # End of XML features
1188
+ ###
1189
+
1190
+ # All words in the run
1191
+ my $words = $run->get_objs_ref();
1192
+
1193
+ # For each word
1194
+ foreach my $word (@{ $words })
1195
+ {
1196
+ # Get word location
1197
+ my $top = $word->get_top_pos();
1198
+ my $bottom = $word->get_bottom_pos();
1199
+ my $left = $word->get_left_pos();
1200
+ my $right = $word->get_right_pos();
1201
+
1202
+ # NOTE: heuristic rule, for words in the same line
1203
+ # If the x-axis distance between this word and the previous word is
1204
+ # three times larger than the distance between the previous word and
1205
+ # the word before it, then it marks the separator.
1206
+ # The better way to do this is to introduce it as a new feature in the
1207
+ # author and affiliation model but this step requires re-training these
1208
+ # two models, so ...
1209
+ #
1210
+ # NOTE: Assuming left to right writing
1211
+ if (! defined $prev_word)
1212
+ {
1213
+ $prev_word = $word;
1214
+ }
1215
+ elsif (! defined $prev_prev_word)
1216
+ {
1217
+ # NOTE: Words have the power to both destroy and heal, when words are both
1218
+ # true and kind, they can change our world
1219
+ if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1220
+ {
1221
+ $prev_prev_word = $prev_word;
1222
+ $prev_word = $word;
1223
+ }
1224
+ }
1225
+ else
1226
+ {
1227
+ # NOTE: Words have the power to both destroy and heal, when words are both
1228
+ # true and kind, they can change our world
1229
+ if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1230
+ {
1231
+ my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
1232
+ my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
1233
+
1234
+ if ($prev_dist * 5 < $curr_dist)
1235
+ {
1236
+ $features .= "\n";
1237
+
1238
+ # NOTE: Relational classifier features
1239
+ $rc_features .= "\n";
1240
+ }
1241
+
1242
+ $prev_prev_word = $prev_word;
1243
+ $prev_word = $word;
1244
+ }
1245
+ }
1246
+
1247
+ # Extract features
1248
+ my $full_content = $word->get_content();
1249
+ # Trim
1250
+ $full_content =~ s/^\s+|\s+$//g;
1251
+
1252
+ # Skip blank run
1253
+ if ($full_content eq "") { next; }
1254
+
1255
+ my @sub_content = ();
1256
+ # This is the tricky part, one word e.g. **affiliation will be
1257
+ # splitted into two parts: the signal, and the affiliation if
1258
+ # possible using regular expression
1259
+ while ($full_content =~ m/([\w|-]*)(\W*)/g)
1260
+ {
1261
+ my $first = $1;
1262
+ my $second = $2;
1263
+
1264
+ # Trim
1265
+ $first =~ s/^\s+|\s+$//g;
1266
+ $second =~ s/^\s+|\s+$//g;
1267
+
1268
+ # Only keep non-blank content
1269
+ if ($first ne "") { push @sub_content, $first; }
1270
+
1271
+ # Check the signal and separator
1272
+ while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
1273
+ {
1274
+ my $sub_first = $1;
1275
+ my $sub_second = $2;
1276
+
1277
+ # Trim
1278
+ $sub_first =~ s/^\s+|\s+$//g;
1279
+ $sub_second =~ s/^\s+|\s+$//g;
1280
+
1281
+ # Only keep non-blank separator
1282
+ if ($sub_first ne "") { push @sub_content, $sub_first; }
1283
+ # Only keep non-blank signal
1284
+ if ($sub_second ne "") { push @sub_content, $sub_second; }
1285
+ }
1286
+ }
1287
+
1288
+ foreach my $content (@sub_content)
1289
+ {
1290
+ # Content
1291
+ $features .= $content . "\t";
1292
+
1293
+ my $content_n = $content;
1294
+ # Remove punctuation
1295
+ $content_n =~ s/[^\w]//g;
1296
+ # Lower case
1297
+ my $content_l = lc($content);
1298
+ # Lower case, no punctuation
1299
+ my $content_nl = lc($content_n);
1300
+ # Lower case
1301
+ $features .= $content_l . "\t";
1302
+ # Lower case, no punctuation
1303
+ if ($content_nl ne "")
1304
+ {
1305
+ $features .= $content_nl . "\t";
1306
+ }
1307
+ else
1308
+ {
1309
+ $features .= $content_l . "\t";
1310
+ }
1311
+
1312
+ # Split into character
1313
+ my @chars = split(//, $content);
1314
+ # Content length
1315
+ my $length = (scalar(@chars) == 1) ? "1-char" :
1316
+ (scalar(@chars) == 2) ? "2-char" :
1317
+ (scalar(@chars) == 3) ? "3-char" : "4+char";
1318
+ $features .= $length . "\t";
1319
+
1320
+ # First word in line
1321
+ if ($is_first_line == 1)
1322
+ {
1323
+ $features .= "begin" . "\t";
1324
+
1325
+ # Next words are not the first in line anymore
1326
+ $is_first_line = 0;
1327
+ }
1328
+ else
1329
+ {
1330
+ $features .= "continue" . "\t";
1331
+ }
1332
+
1333
+ ###
1334
+ # The following features are XML features
1335
+ ###
1336
+
1337
+ # Bold format
1338
+ $features .= $bold . "\t";
1339
+
1340
+ # Italic format
1341
+ $features .= $italic . "\t";
1342
+
1343
+ # Underline
1344
+ $features .= $underline . "\t";
1345
+
1346
+ # Sub-Sup-script
1347
+ $features .= $suscript . "\t";
1348
+
1349
+ # Relative font size
1350
+ $features .= $fontsize . "\t";
1351
+
1352
+ # First word in run
1353
+ if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
1354
+ {
1355
+ $features .= "fbegin" . "\t";
1356
+ }
1357
+ else
1358
+ {
1359
+ $features .= "fcontinue" . "\t";
1360
+ }
1361
+
1362
+ # New token
1363
+ $features .= "\n";
1364
+
1365
+ # Save the XML format
1366
+ $prev_bold = $bold;
1367
+ $prev_italic = $italic;
1368
+ $prev_underline = $underline;
1369
+ $prev_suscript = $suscript;
1370
+ $prev_fontsize = $fontsize;
1371
+
1372
+ # NOTE: Relational classifier features
1373
+ # Content
1374
+ $rc_features .= $content . "\t";
1375
+ # Location
1376
+ $rc_features .= $top . "\t";
1377
+ $rc_features .= $bottom . "\t";
1378
+ $rc_features .= $left . "\t";
1379
+ $rc_features .= $right . "\t";
1380
+ # Index
1381
+ if (! defined $size_mismatch)
1382
+ {
1383
+ $rc_features .= $aff_addrs->[ $counter ]->{ 'L1' } . "\t";
1384
+ $rc_features .= $aff_addrs->[ $counter ]->{ 'L2' } . "\t";
1385
+ $rc_features .= $aff_addrs->[ $counter ]->{ 'L3' } . "\t";
1386
+ $rc_features .= $aff_addrs->[ $counter ]->{ 'L4' } . "\t";
1387
+ }
1388
+ # Done
1389
+ $rc_features .= "\n";
1390
+ }
1391
+ }
1392
+ }
1393
+ }
1394
+
1395
+ return ($features, $rc_features);
1396
+
1397
+ }
1398
+
1399
+ # Extract features from author lines
1400
+ # The list of features include
1401
+ # Content
1402
+ # Content, lower case, no punctuation
1403
+ # Content length
1404
+ # Capitalization
1405
+ # Numeric property
1406
+ # Last punctuation
1407
+ # First 4-gram
1408
+ # Last 4-gram
1409
+ # Dictionary
1410
+ # First word in line
1411
+ #
1412
+ # XML features
1413
+ # Subscript, superscript
1414
+ # Bold
1415
+ # Italic
1416
+ # Underline
1417
+ # Relative font size
1418
+ # Differentiate features
1419
+ sub AuthorFeatureExtraction
1420
+ {
1421
+ my ($aut_lines, $aut_addrs) = @_;
1422
+
1423
+ # NOTE: Relational classifier features
1424
+ my $rc_features = "";
1425
+
1426
+ # Features will be stored here
1427
+ my $features = "";
1428
+ # First word in line
1429
+ my $is_first_line = undef;
1430
+ # First word in run
1431
+ # my $is_first_run = undef;
1432
+
1433
+ # Font size
1434
+ my %fonts = ();
1435
+ # Each line contains many runs
1436
+ foreach my $line (@{ $aut_lines })
1437
+ {
1438
+ my $runs = $line->get_objs_ref();
1439
+ # Iterator though all work in all lines
1440
+ foreach my $run (@{ $runs })
1441
+ {
1442
+ my $fsize = $run->get_font_size();
1443
+ my $words = $run->get_objs_ref();
1444
+
1445
+ # Statistic
1446
+ if (! exists $fonts{ $fsize })
1447
+ {
1448
+ $fonts{ $fsize } = scalar(@{ $words });
1449
+ }
1450
+ else
1451
+ {
1452
+ $fonts{ $fsize } += scalar(@{ $words });
1453
+ }
1454
+ }
1455
+ }
1456
+
1457
+ my $dominate_font = undef;
1458
+ # Sort all the font descend with the number of their appearance
1459
+ my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
1460
+ # Select the dominated font
1461
+ $dominate_font = $sorted[ 0 ];
1462
+
1463
+ my $size_mismatch = undef;
1464
+ # TODO: serious error if the size of aut_lines and the size of aut_addrs mismatch
1465
+ if (scalar(@{ $aut_lines }) != scalar(@{ $aut_addrs }))
1466
+ {
1467
+ $size_mismatch = 1;
1468
+ # Print the error but still try to continue
1469
+ print STDERR "# Total number of author lines (" . scalar(@{ $aut_lines }) . ") != Total number of author addresses (" . scalar(@{ $aut_addrs }) . ")." . "\n";
1470
+ }
1471
+
1472
+ my $prev_page = undef;
1473
+ my $prev_sect = undef;
1474
+ my $prev_para = undef;
1475
+ # Each line contains many runs
1476
+ for (my $counter = 0; $counter < scalar(@{ $aut_lines }); $counter++)
1477
+ {
1478
+ # Get the line object
1479
+ my $line = $aut_lines->[ $counter ];
1480
+
1481
+ # Check the size of aut_line and aut_addrs
1482
+ if (! defined $size_mismatch)
1483
+ {
1484
+ # Check if two consecutive lines are from two different sections
1485
+ if (! defined $prev_page)
1486
+ {
1487
+ # Init
1488
+ $prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
1489
+ $prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
1490
+ $prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
1491
+ }
1492
+ else
1493
+ {
1494
+ # Authors from different sections will be separated immediately
1495
+ if (($prev_page != $aut_addrs->[ $counter ]->{ 'L1' }) ||
1496
+ ($prev_sect != $aut_addrs->[ $counter ]->{ 'L2' }) ||
1497
+ ($prev_para != $aut_addrs->[ $counter ]->{ 'L3' }))
1498
+ {
1499
+ $features .= "\n";
1500
+
1501
+ # NOTE: Relational classifier features
1502
+ $rc_features .= "\n";
1503
+ }
1504
+
1505
+ # Save the paragraph index
1506
+ $prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
1507
+ $prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
1508
+ $prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
1509
+ }
1510
+ }
1511
+
1512
+ # Set first word in line
1513
+ $is_first_line = 1;
1514
+
1515
+ # Previous word and the word before this
1516
+ my $prev_prev_word = undef;
1517
+ my $prev_word = undef;
1518
+
1519
+ # Format of the previous word
1520
+ my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
1521
+
1522
+ my $runs = $line->get_objs_ref();
1523
+ # Iterator though all work in all lines
1524
+ foreach my $run (@{ $runs })
1525
+ {
1526
+ # The run must be non-empty
1527
+ my $tmp = $run->get_content();
1528
+ # Trim
1529
+ $tmp =~ s/^\s+|\s+$//g;
1530
+ # Skip blank run
1531
+ if ($tmp eq "") { next; }
1532
+
1533
+ # Set first word in run
1534
+ # $is_first_run = 1;
1535
+
1536
+ ###
1537
+ # The following features are XML features
1538
+ ###
1539
+
1540
+ # Bold format
1541
+ my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
1542
+
1543
+ # Italic format
1544
+ my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
1545
+
1546
+ # Underline
1547
+ my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
1548
+
1549
+ # Sub-Sup-script
1550
+ my $suscript = ($run->get_suscript() eq "superscript") ? "super" :
1551
+ ($run->get_suscript() eq "subscript") ? "sub" : "none";
1552
+
1553
+ # Relative font size
1554
+ my $fontsize = ($run->get_font_size() > $dominate_font) ? "large" :
1555
+ ($run->get_font_size() < $dominate_font) ? "small" : "normal";
1556
+
1557
+ ###
1558
+ # End of XML features
1559
+ ###
1560
+
1561
+ # All words in the run
1562
+ my $words = $run->get_objs_ref();
1563
+
1564
+ # For each word
1565
+ foreach my $word (@{ $words })
1566
+ {
1567
+ # Get word location
1568
+ my $top = $word->get_top_pos();
1569
+ my $bottom = $word->get_bottom_pos();
1570
+ my $left = $word->get_left_pos();
1571
+ my $right = $word->get_right_pos();
1572
+
1573
+ # NOTE: heuristic rule, for words in the same line
1574
+ # If the x-axis distance between this word and the previous word is
1575
+ # three times larger than the distance between the previous word and
1576
+ # the word before it, then it marks the separator.
1577
+ # The better way to do this is to introduce it as a new feature in the
1578
+ # author and affiliation model but this step requires re-training these
1579
+ # two models, so ...
1580
+ #
1581
+ # NOTE: Assuming left to right writing
1582
+ if (! defined $prev_word)
1583
+ {
1584
+ $prev_word = $word;
1585
+ }
1586
+ elsif (! defined $prev_prev_word)
1587
+ {
1588
+ # NOTE: Words have the power to both destroy and heal, when words are both
1589
+ # true and kind, they can change our world
1590
+ if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1591
+ {
1592
+ $prev_prev_word = $prev_word;
1593
+ $prev_word = $word;
1594
+ }
1595
+ }
1596
+ else
1597
+ {
1598
+ # NOTE: Words have the power to both destroy and heal, when words are both
1599
+ # true and kind, they can change our world
1600
+ if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
1601
+ {
1602
+
1603
+ my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
1604
+ my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
1605
+
1606
+ if ($prev_dist * 5 < $curr_dist)
1607
+ {
1608
+ $features .= "\n";
1609
+
1610
+ # NOTE: Relational classifier features
1611
+ $rc_features .= "\n";
1612
+ }
1613
+
1614
+ $prev_prev_word = $prev_word;
1615
+ $prev_word = $word;
1616
+ }
1617
+ }
1618
+
1619
+ # Extract features
1620
+ my $full_content = $word->get_content();
1621
+ # Trim
1622
+ $full_content =~ s/^\s+|\s+$//g;
1623
+
1624
+ # Skip blank run
1625
+ if ($full_content eq "") { next; }
1626
+
1627
+ my @sub_content = ();
1628
+ # This is the tricky part, one word e.g. name** will be splitted
1629
+ # into several parts: the name, the signal, and the separator if
1630
+ # possible using regular expression
1631
+ while ($full_content =~ m/([\w|-]*)(\W*)/g)
1632
+ {
1633
+ my $first = $1;
1634
+ my $second = $2;
1635
+
1636
+ # Trim
1637
+ $first =~ s/^\s+|\s+$//g;
1638
+ $second =~ s/^\s+|\s+$//g;
1639
+
1640
+ # Only keep non-blank content
1641
+ if ($first ne "") { push @sub_content, $first; }
1642
+
1643
+ # Check the signal and separator
1644
+ while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
1645
+ {
1646
+ my $sub_first = $1;
1647
+ my $sub_second = $2;
1648
+
1649
+ # Trim
1650
+ $sub_first =~ s/^\s+|\s+$//g;
1651
+ $sub_second =~ s/^\s+|\s+$//g;
1652
+
1653
+ # Only keep non-blank separator
1654
+ if ($sub_first ne "") { push @sub_content, $sub_first; }
1655
+ # Only keep non-blank signal
1656
+ if ($sub_second ne "") { push @sub_content, $sub_second; }
1657
+ }
1658
+ }
1659
+
1660
+ foreach my $content (@sub_content)
1661
+ {
1662
+ # Content
1663
+ $features .= $content . "\t";
1664
+
1665
+ my $content_n = $content;
1666
+ # Remove punctuation
1667
+ $content_n =~ s/[^\w]//g;
1668
+ # Lower case
1669
+ my $content_l = lc($content);
1670
+ # Lower case, no punctuation
1671
+ my $content_nl = lc($content_n);
1672
+ # Lower case
1673
+ $features .= $content_l . "\t";
1674
+ # Lower case, no punctuation
1675
+ if ($content_nl ne "")
1676
+ {
1677
+ $features .= $content_nl . "\t";
1678
+ }
1679
+ else
1680
+ {
1681
+ $features .= $content_l . "\t";
1682
+ }
1683
+
1684
+ # Capitalization
1685
+ my $ortho = ($content =~ /^[\p{IsUpper}]$/) ? "single" :
1686
+ ($content =~ /^[\p{IsUpper}][\p{IsLower}]+/) ? "init" :
1687
+ ($content =~ /^[\p{IsUpper}]+$/) ? "all" : "others";
1688
+ $features .= $ortho . "\t";
1689
+
1690
+ # Numeric property
1691
+ my $num = ($content =~ /^[0-9]$/) ? "1dig" :
1692
+ ($content =~ /^[0-9][0-9]$/) ? "2dig" :
1693
+ ($content =~ /^[0-9][0-9][0-9]$/) ? "3dig" :
1694
+ ($content =~ /^[0-9]+$/) ? "4+dig" :
1695
+ ($content =~ /^[0-9]+(th|st|nd|rd)$/) ? "ordinal" :
1696
+ ($content =~ /[0-9]/) ? "hasdig" : "nonnum";
1697
+ $features .= $num . "\t";
1698
+
1699
+ # Last punctuation
1700
+ my $punct = ($content =~ /^[\"\'\`]/) ? "leadq" :
1701
+ ($content =~ /[\"\'\`][^s]?$/) ? "endq" :
1702
+ ($content =~ /\-.*\-/) ? "multi" :
1703
+ ($content =~ /[\-\,\:\;]$/) ? "cont" :
1704
+ ($content =~ /[\!\?\.\"\']$/) ? "stop" :
1705
+ ($content =~ /^[\(\[\{\<].+[\)\]\}\>].?$/) ? "braces" : "others";
1706
+ $features .= $punct . "\t";
1707
+
1708
+ # Split into character
1709
+ my @chars = split(//, $content);
1710
+ my $clen = scalar @chars;
1711
+ # Content length
1712
+ my $length = (scalar(@chars) == 1) ? "1-char" :
1713
+ (scalar(@chars) == 2) ? "2-char" :
1714
+ (scalar(@chars) == 3) ? "3-char" : "4+char";
1715
+ $features .= $length . "\t";
1716
+ # First n-gram
1717
+ $features .= $chars[ 0 ] . "\t";
1718
+ if ($clen >= 2) {
1719
+ $features .= join("", @chars[ 0..1 ]) . "\t";
1720
+ } else {
1721
+ $features .= $length . "\t";
1722
+ }
1723
+ if ($clen >= 3) {
1724
+ $features .= join("", @chars[ 0..2 ]) . "\t";
1725
+ } elsif ($clen >= 2) {
1726
+ $features .= join("", @chars[ 0..1 ]) . "\t";
1727
+ } else {
1728
+ $features .= $length . "\t";
1729
+ }
1730
+ if ($clen >= 4) {
1731
+ $features .= join("", @chars[ 0..3 ]) . "\t";
1732
+ } elsif ($clen >= 3) {
1733
+ $features .= join("", @chars[ 0..2 ]) . "\t";
1734
+ } elsif ($clen >= 2) {
1735
+ $features .= join("", @chars[ 0..1 ]) . "\t";
1736
+ } else {
1737
+ $features .= $length . "\t";
1738
+ }
1739
+ # Last n-gram
1740
+ $features .= $chars[ -1 ] . "\t";
1741
+ if ($clen >= 2) {
1742
+ $features .= join("", @chars[ -2..-1 ]) . "\t";
1743
+ } else {
1744
+ $features .= $chars[ -1 ] . "\t";
1745
+ }
1746
+ if ($clen >= 3) {
1747
+ $features .= join("", @chars[ -3..-1 ]) . "\t";
1748
+ } elsif ($clen >= 2) {
1749
+ $features .= join("", @chars[ -2..-1 ]) . "\t";
1750
+ } else {
1751
+ $features .= $chars[ -1 ] . "\t";
1752
+ }
1753
+ if ($clen >= 4) {
1754
+ $features .= join("", @chars[ -4..-1 ]) . "\t";
1755
+ } elsif ($clen >= 3) {
1756
+ $features .= join("", @chars[ -3..-1 ]) . "\t";
1757
+ } elsif ($clen >= 2) {
1758
+ $features .= join("", @chars[ -2..-1 ]) . "\t";
1759
+ } else {
1760
+ $features .= $chars[ -1 ] . "\t";
1761
+ }
1762
+
1763
+ # Dictionary
1764
+ my $dict_status = (defined $dict{ $content_nl }) ? $dict{ $content_nl } : 0;
1765
+ # Possible names
1766
+ my ($publisher_name, $place_name, $month_name, $last_name, $female_name, $male_name) = undef;
1767
+ # Check all case
1768
+ if ($dict_status >= 32) { $dict_status -= 32; $publisher_name = "publisher" } else { $publisher_name = "no"; }
1769
+ if ($dict_status >= 16) { $dict_status -= 16; $place_name = "place" } else { $place_name = "no"; }
1770
+ if ($dict_status >= 8) { $dict_status -= 8; $month_name = "month" } else { $month_name = "no"; }
1771
+ if ($dict_status >= 4) { $dict_status -= 4; $last_name = "last" } else { $last_name = "no"; }
1772
+ if ($dict_status >= 2) { $dict_status -= 2; $female_name = "female" } else { $female_name = "no"; }
1773
+ if ($dict_status >= 1) { $dict_status -= 1; $male_name = "male" } else { $male_name = "no"; }
1774
+ # Save the feature
1775
+ $features .= $male_name . "\t";
1776
+ $features .= $female_name . "\t";
1777
+ $features .= $last_name . "\t";
1778
+ $features .= $month_name . "\t";
1779
+ $features .= $place_name . "\t";
1780
+ $features .= $publisher_name . "\t";
1781
+
1782
+ # First word in line
1783
+ if ($is_first_line == 1)
1784
+ {
1785
+ $features .= "begin" . "\t";
1786
+
1787
+ # Next words are not the first in line anymore
1788
+ $is_first_line = 0;
1789
+ }
1790
+ else
1791
+ {
1792
+ $features .= "continue" . "\t";
1793
+ }
1794
+
1795
+ ###
1796
+ # The following features are XML features
1797
+ ###
1798
+
1799
+ # Bold format
1800
+ $features .= $bold . "\t";
1801
+
1802
+ # Italic format
1803
+ $features .= $italic . "\t";
1804
+
1805
+ # Underline
1806
+ $features .= $underline . "\t";
1807
+
1808
+ # Sub-Sup-script
1809
+ $features .= $suscript . "\t";
1810
+
1811
+ # Relative font size
1812
+ $features .= $fontsize . "\t";
1813
+
1814
+ # First word in run
1815
+ if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
1816
+ {
1817
+ $features .= "fbegin" . "\t";
1818
+
1819
+ # Next words are not the first in line anymore
1820
+ # $is_first_run = 0;
1821
+ }
1822
+ else
1823
+ {
1824
+ $features .= "fcontinue" . "\t";
1825
+ }
1826
+
1827
+ # New token
1828
+ $features .= "\n";
1829
+
1830
+ # Save the XML format
1831
+ $prev_bold = $bold;
1832
+ $prev_italic = $italic;
1833
+ $prev_underline = $underline;
1834
+ $prev_suscript = $suscript;
1835
+ $prev_fontsize = $fontsize;
1836
+
1837
+ # NOTE: Relational classifier features
1838
+ # Content
1839
+ $rc_features .= $content . "\t";
1840
+ # Location
1841
+ $rc_features .= $top . "\t";
1842
+ $rc_features .= $bottom . "\t";
1843
+ $rc_features .= $left . "\t";
1844
+ $rc_features .= $right . "\t";
1845
+ # Index
1846
+ if (! defined $size_mismatch)
1847
+ {
1848
+ $rc_features .= $aut_addrs->[ $counter ]->{ 'L1' } . "\t";
1849
+ $rc_features .= $aut_addrs->[ $counter ]->{ 'L2' } . "\t";
1850
+ $rc_features .= $aut_addrs->[ $counter ]->{ 'L3' } . "\t";
1851
+ $rc_features .= $aut_addrs->[ $counter ]->{ 'L4' } . "\t";
1852
+ }
1853
+ # Done
1854
+ $rc_features .= "\n";
1855
+ }
1856
+ }
1857
+ }
1858
+ }
1859
+
1860
+ return ($features, $rc_features);
1861
+ }
1862
+
1863
+ sub ReadDict
1864
+ {
1865
+ my ($dictfile) = @_;
1866
+
1867
+ # Absolute path
1868
+ my $dictfile_abs = File::Spec->rel2abs($dictfile);
1869
+ # Dictionary handle
1870
+ my $dict_handle = undef;
1871
+ open ($dict_handle, "<:utf8", $dictfile_abs) || die "Could not open dict file $dictfile_abs: $!";
1872
+
1873
+ my $mode = 0;
1874
+ while (<$dict_handle>)
1875
+ {
1876
+ if (/^\#\# Male/) { $mode = 1; } # male names
1877
+ elsif (/^\#\# Female/) { $mode = 2; } # female names
1878
+ elsif (/^\#\# Last/) { $mode = 4; } # last names
1879
+ elsif (/^\#\# Chinese/) { $mode = 4; } # last names
1880
+ elsif (/^\#\# Months/) { $mode = 8; } # month names
1881
+ elsif (/^\#\# Place/) { $mode = 16; } # place names
1882
+ elsif (/^\#\# Publisher/) { $mode = 32; } # publisher names
1883
+ elsif (/^\#/) { next; }
1884
+ else
1885
+ {
1886
+ chop;
1887
+ my $key = $_;
1888
+ my $val = 0;
1889
+ # Has probability
1890
+ if (/\t/) { ($key,$val) = split (/\t/,$_); }
1891
+
1892
+ # Already tagged (some entries may appear in same part of lexicon more than once
1893
+ if (! exists $dict{ $key })
1894
+ {
1895
+ $dict{ $key } = $mode;
1896
+ }
1897
+ else
1898
+ {
1899
+ if ($dict{ $key } >= $mode)
1900
+ {
1901
+ next;
1902
+ }
1903
+ # Not yet tagged
1904
+ else
1905
+ {
1906
+ $dict{ $key } += $mode;
1907
+ }
1908
+ }
1909
+ }
1910
+ }
1911
+
1912
+ close ($dict_handle);
1913
+ }
1914
+
1915
+ sub BuildTmpFile
1916
+ {
1917
+ my ($filename) = @_;
1918
+
1919
+ my $tmpfile = $filename;
1920
+ $tmpfile =~ s/[\.\/]//g;
1921
+ $tmpfile .= $$ . time;
1922
+
1923
+ # Untaint tmpfile variable
1924
+ if ($tmpfile =~ /^([-\@\w.]+)$/)
1925
+ {
1926
+ $tmpfile = $1;
1927
+ }
1928
+
1929
+ return "/tmp/$tmpfile"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
1930
+ }
1931
+
1932
+ 1;
1933
+
1934
+
1935
+
1936
+
1937
+
1938
+
1939
+
1940
+
1941
+
1942
+
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+