opener-opinion-detector-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,587 @@
1
+ /*
2
+ * Constant Quark Database (CQDB).
3
+ *
4
+ * Copyright (c) 2007, Naoaki Okazaki
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ * * Neither the name of the Northwestern University, University of Tokyo,
15
+ * nor the names of its contributors may be used to endorse or promote
16
+ * products derived from this software without specific prior written
17
+ * permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
23
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ */
31
+
32
+ /* $Id$ */
33
+
34
+ #include <stdio.h>
35
+ #include <stdlib.h>
36
+ #include <stdint.h>
37
+ #include <string.h>
38
+
39
+ #include <cqdb.h>
40
+
41
/* Four-byte tag found at the very start of every CQDB chunk. */
#define CHUNKID             "CQDB"
/* Constant stored in the header; the reader rejects a chunk whose decoded
   value differs from this one. */
#define BYTEORDER_CHECK     (0x62445371)
/* Number of hash tables a database is split into. */
#define NUM_TABLES          (256)
/* Offset (from the chunk start) of the table-reference array: it directly
   follows the header. */
#define OFFSET_REFS         (0 + sizeof(header_t))
/* Offset (from the chunk start) of the first key/data record: it directly
   follows the NUM_TABLES table references. */
#define OFFSET_DATA         (OFFSET_REFS + sizeof(tableref_t) * NUM_TABLES)
46
+
47
/**
 * One slot of a hash table.
 *
 * The writer and the lookup code both treat a slot whose offset is zero
 * as vacant.
 */
typedef struct {
    /** Full 32-bit hash value of the record's key. */
    uint32_t    hash;
    /** Offset address to the actual record (0 marks an empty slot). */
    uint32_t    offset;
} bucket_t;
54
+
55
+ /**
56
+ * A hash table.
57
+ */
58
+ typedef struct {
59
+ uint32_t num; /**< Number of elements in the table. */
60
+ uint32_t size; /**< Maximum number of elements. */
61
+ bucket_t* bucket; /**< Bucket (array of bucket_t). */
62
+ } table_t;
63
+
64
/**
 * CQDB chunk header, as stored at the very beginning of a chunk.
 */
typedef struct {
    /** Chunk identifier, "CQDB" (four bytes, not NUL-terminated). */
    int8_t      chunkid[4];
    /** Chunk size including this header. */
    uint32_t    size;
    /** Global flags. */
    uint32_t    flag;
    /** Byte-order indicator (compared against BYTEORDER_CHECK on load). */
    uint32_t    byteorder;
    /** Number of elements in the backward array. */
    uint32_t    bwd_size;
    /** Offset to the backward array (0 when there is none). */
    uint32_t    bwd_offset;
} header_t;
75
+
76
/**
 * Reference to one hash table, as stored in the array that follows the
 * chunk header.
 */
typedef struct {
    /** Offset to the hash table (0 for a table that does not exist). */
    uint32_t    offset;
    /** Number of slots in the hash table. */
    uint32_t    num;
} tableref_t;
83
+
84
+ /**
85
+ * Writer for a constant quark database.
86
+ */
87
+ struct tag_cqdb_writer {
88
+ uint32_t flag; /**< Operation flag. */
89
+ FILE* fp; /**< File pointer. */
90
+ uint32_t begin; /**< Offset address to the head of this database. */
91
+ uint32_t cur; /**< Offset address to a new key/data pair. */
92
+ table_t ht[NUM_TABLES]; /**< Hash tables (string -> id). */
93
+
94
+ uint32_t* bwd; /**< Backlink array. */
95
+ uint32_t bwd_num; /**< */
96
+ uint32_t bwd_size; /**< Number of elements in the backlink array. */
97
+ };
98
+
99
+ /**
100
+ * Constant quark database (CQDB).
101
+ */
102
+ struct tag_cqdb {
103
+ uint8_t* buffer; /**< Pointer to the memory block. */
104
+ size_t size; /**< Size of the memory block. */
105
+
106
+ header_t header; /**< Chunk header. */
107
+ table_t ht[NUM_TABLES]; /**< Hash tables (string -> id). */
108
+
109
+ uint32_t* bwd; /**< Array for backward look-up (id -> string). */
110
+
111
+ int num; /**< Number of key/data pairs. */
112
+ };
113
+
114
+
115
/*
 * Hash function used to pick a table and a slot for every key.
 * Only declared here; the definition lives in another translation unit
 * (presumably lookup3.c shipped next to this file -- TODO confirm).
 */
extern uint32_t hashlittle(const void *key, size_t length, uint32_t initval);
116
+
117
+
118
+
119
+
120
+ static size_t write_uint32(cqdb_writer_t* wt, uint32_t value)
121
+ {
122
+ uint8_t buffer[4];
123
+ buffer[0] = (uint8_t)(value & 0xFF);
124
+ buffer[1] = (uint8_t)(value >> 8);
125
+ buffer[2] = (uint8_t)(value >> 16);
126
+ buffer[3] = (uint8_t)(value >> 24);
127
+ return fwrite(buffer, sizeof(uint8_t), 4, wt->fp) / sizeof(value);
128
+ }
129
+
130
+ static size_t write_data(cqdb_writer_t* wt, const void *data, size_t size)
131
+ {
132
+ return fwrite(data, size, 1, wt->fp);
133
+ }
134
+
135
+ cqdb_writer_t* cqdb_writer(FILE *fp, int flag)
136
+ {
137
+ int i;
138
+ cqdb_writer_t* dbw = (cqdb_writer_t*)calloc(1, sizeof(cqdb_writer_t));
139
+
140
+ if (dbw != NULL) {
141
+ /* Initialize cqdb_writer_t members. */
142
+ memset(dbw, 0, sizeof(*dbw));
143
+ dbw->flag = flag;
144
+ dbw->fp = fp;
145
+ dbw->begin = ftell(dbw->fp);
146
+ dbw->cur = OFFSET_DATA;
147
+
148
+ /* Initialize the hash tables.*/
149
+ for (i = 0;i < NUM_TABLES;++i) {
150
+ dbw->ht[i].bucket = NULL;
151
+ }
152
+
153
+ dbw->bwd = NULL;
154
+ dbw->bwd_num = 0;
155
+ dbw->bwd_size = 0;
156
+
157
+ /* Move the file pointer to the offset to the first key/data pair. */
158
+ if (fseek(dbw->fp, dbw->begin + dbw->cur, SEEK_SET) != 0) {
159
+ goto error_exit; /* Seek error. */
160
+ }
161
+ }
162
+
163
+ return dbw;
164
+
165
+ error_exit:
166
+ free(dbw);
167
+ return NULL;
168
+ }
169
+
170
+ static int cqdb_writer_delete(cqdb_writer_t* dbw)
171
+ {
172
+ int i;
173
+
174
+ /* Free allocated memory blocks. */
175
+ for (i = 0;i < NUM_TABLES;++i) {
176
+ free(dbw->ht[i].bucket);
177
+ }
178
+ free(dbw->bwd);
179
+ free(dbw);
180
+ return 0;
181
+ }
182
+
183
+ int cqdb_writer_put(cqdb_writer_t* dbw, const char *str, int id)
184
+ {
185
+ int ret = 0;
186
+ const void *key = str;
187
+ uint32_t ksize = (uint32_t)(strlen(str) + 1);
188
+
189
+ /* Compute the hash value and choose a hash table. */
190
+ uint32_t hv = hashlittle(key, ksize, 0);
191
+ table_t* ht = &dbw->ht[hv % 256];
192
+
193
+ /* Check for non-negative identifier. */
194
+ if (id < 0) {
195
+ ret = CQDB_ERROR_INVALIDID;
196
+ goto error_exit;
197
+ }
198
+
199
+ /* Write out the current data. */
200
+ write_uint32(dbw, (uint32_t)id);
201
+ write_uint32(dbw, (uint32_t)ksize);
202
+ write_data(dbw, key, ksize);
203
+ if (ferror(dbw->fp)) {
204
+ ret = CQDB_ERROR_FILEWRITE;
205
+ goto error_exit;
206
+ }
207
+
208
+ /* Expand the bucket if necessary. */
209
+ if (ht->size <= ht->num) {
210
+ ht->size = (ht->size+1) * 2;
211
+ ht->bucket = (bucket_t*)realloc(ht->bucket, sizeof(bucket_t) * ht->size);
212
+ if (ht->bucket == NULL) {
213
+ ret = CQDB_ERROR_OUTOFMEMORY;
214
+ goto error_exit;
215
+ }
216
+ }
217
+
218
+ /* Set the hash value and current offset position. */
219
+ ht->bucket[ht->num].hash = hv;
220
+ ht->bucket[ht->num].offset = dbw->cur;
221
+ ++ht->num;
222
+
223
+ /* Store the backlink if specified. */
224
+ if (!(dbw->flag & CQDB_ONEWAY)) {
225
+ /* Expand the backlink array if necessary. */
226
+ if (dbw->bwd_size <= (uint32_t)id) {
227
+ uint32_t size = dbw->bwd_size;
228
+
229
+ while (size <= (uint32_t)id) size = (size + 1) * 2;
230
+ dbw->bwd = (uint32_t*)realloc(dbw->bwd, sizeof(uint32_t) * size);
231
+ if (dbw->bwd == NULL) {
232
+ ret = CQDB_ERROR_OUTOFMEMORY;
233
+ goto error_exit;
234
+ }
235
+ while (dbw->bwd_size < size) {
236
+ dbw->bwd[dbw->bwd_size++] = 0;
237
+ }
238
+ }
239
+
240
+ if (dbw->bwd_num <= (uint32_t)id) {
241
+ dbw->bwd_num = (uint32_t)id+1;
242
+ }
243
+
244
+ dbw->bwd[id] = dbw->cur;
245
+ }
246
+
247
+ /* Increment the current position. */
248
+ dbw->cur += sizeof(uint32_t) + sizeof(uint32_t) + ksize;
249
+ return 0;
250
+
251
+ error_exit:
252
+ dbw->flag |= CQDB_ERROR_OCCURRED;
253
+ return ret;
254
+ }
255
+
256
+ int cqdb_writer_close(cqdb_writer_t* dbw)
257
+ {
258
+ uint32_t i, j;
259
+ int k, ret = 0;
260
+ long offset = 0;
261
+ header_t header;
262
+
263
+ /* If an error have occurred, just free the memory blocks. */
264
+ if (dbw->flag & CQDB_ERROR_OCCURRED) {
265
+ cqdb_writer_delete(dbw);
266
+ return 0;
267
+ }
268
+
269
+ /* Initialize the file header. */
270
+ strncpy((char*)header.chunkid, CHUNKID, 4);
271
+ header.byteorder = BYTEORDER_CHECK;
272
+ header.bwd_offset = 0;
273
+ header.bwd_size = dbw->bwd_num;
274
+
275
+ /*
276
+ Store the hash tables. At this moment, the file pointer refers to
277
+ the offset succeeding the last key/data pair.
278
+ */
279
+ for (i = 0;i < NUM_TABLES;++i) {
280
+ table_t* ht = &dbw->ht[i];
281
+
282
+ /* Do not write empty hash tables. */
283
+ if (ht->bucket != NULL) {
284
+ /*
285
+ Actual bucket will have the double size; half elements
286
+ in the bucket are kept empty.
287
+ */
288
+ int n = ht->num * 2;
289
+
290
+ /* Allocate the bucket. */
291
+ bucket_t* dst = (bucket_t*)calloc(n, sizeof(bucket_t));
292
+ if (dst == NULL) {
293
+ ret = CQDB_ERROR_OUTOFMEMORY;
294
+ goto error_exit;
295
+ }
296
+
297
+ /*
298
+ Put hash elements to the bucket with the open-address method.
299
+ */
300
+ for (j = 0;j < ht->num;++j) {
301
+ const bucket_t* src = &ht->bucket[j];
302
+ int k = (src->hash >> 8) % n;
303
+
304
+ /* Find a vacant element. */
305
+ while (dst[k].offset != 0) {
306
+ k = (k+1) % n;
307
+ }
308
+
309
+ /* Store the hash element. */
310
+ dst[k].hash = src->hash;
311
+ dst[k].offset = src->offset;
312
+ }
313
+
314
+ /* Write the bucket. */
315
+ for (k = 0;k < n;++k) {
316
+ write_uint32(dbw, dst[k].hash);
317
+ write_uint32(dbw, dst[k].offset);
318
+ }
319
+
320
+ /* Free the bucket. */
321
+ free(dst);
322
+ }
323
+ }
324
+
325
+ /* Write the backlink array if specified. */
326
+ if (!(dbw->flag & CQDB_ONEWAY) && 0 < dbw->bwd_size) {
327
+ /* Store the offset to the head of this array. */
328
+ header.bwd_offset = ftell(dbw->fp) - dbw->begin;
329
+ /* Store the contents of the backlink array. */
330
+ for (i = 0;i < dbw->bwd_num;++i) {
331
+ write_uint32(dbw, dbw->bwd[i]);
332
+ }
333
+ }
334
+
335
+ /* Check for an occurrence of a file-related error. */
336
+ if (ferror(dbw->fp)) {
337
+ ret = CQDB_ERROR_FILEWRITE;
338
+ goto error_exit;
339
+ }
340
+
341
+ /* Store the current position. */
342
+ offset = ftell(dbw->fp);
343
+ if (offset == -1) {
344
+ ret = CQDB_ERROR_FILETELL;
345
+ goto error_exit;
346
+ }
347
+ header.size = (uint32_t)offset - dbw->begin;
348
+
349
+ /* Rewind the current position to the beginning. */
350
+ if (fseek(dbw->fp, dbw->begin, SEEK_SET) != 0) {
351
+ ret = CQDB_ERROR_FILESEEK;
352
+ goto error_exit;
353
+ }
354
+
355
+ /* Write the file header. */
356
+ write_data(dbw, header.chunkid, 4);
357
+ write_uint32(dbw, header.size);
358
+ write_uint32(dbw, header.flag);
359
+ write_uint32(dbw, header.byteorder);
360
+ write_uint32(dbw, header.bwd_size);
361
+ write_uint32(dbw, header.bwd_offset);
362
+
363
+ /*
364
+ Write references to hash tables. At this moment, dbw->cur points
365
+ to the offset succeeding the last key/data pair.
366
+ */
367
+ for (i = 0;i < NUM_TABLES;++i) {
368
+ /* Offset to the hash table (or zero for non-existent tables). */
369
+ write_uint32(dbw, dbw->ht[i].num ? dbw->cur : 0);
370
+ /* Bucket size is double to the number of elements. */
371
+ write_uint32(dbw, dbw->ht[i].num * 2);
372
+ /* Advance the offset counter. */
373
+ dbw->cur += (dbw->ht[i].num * 2) * sizeof(bucket_t);
374
+ }
375
+
376
+ /* Check an occurrence of a file-related error. */
377
+ if (ferror(dbw->fp)) {
378
+ ret = CQDB_ERROR_FILEWRITE;
379
+ goto error_exit;
380
+ }
381
+
382
+ /* Seek to the last position. */
383
+ if (fseek(dbw->fp, offset, SEEK_SET) != 0) {
384
+ ret = CQDB_ERROR_FILESEEK;
385
+ goto error_exit;
386
+ }
387
+
388
+ cqdb_writer_delete(dbw);
389
+ return ret;
390
+
391
+ error_exit:
392
+ /* Seek to the first position. */
393
+ fseek(dbw->fp, dbw->begin, SEEK_SET);
394
+ cqdb_writer_delete(dbw);
395
+ return ret;
396
+ }
397
+
398
+
399
+
400
/**
 * Decode a 32-bit value stored least-significant byte first.
 *  @param  p       Pointer to at least four readable bytes.
 *  @return The decoded value, independent of the host byte order.
 */
static uint32_t read_uint32(uint8_t* p)
{
    uint32_t value = 0;
    int i;

    /* Fold from the most significant byte (p[3]) down to p[0]. */
    for (i = 3; 0 <= i; --i) {
        value = (value << 8) | (uint32_t)p[i];
    }
    return value;
}
409
+
410
+ static uint8_t *read_tableref(tableref_t* ref, uint8_t *p)
411
+ {
412
+ ref->offset = read_uint32(p);
413
+ p += sizeof(uint32_t);
414
+ ref->num = read_uint32(p);
415
+ p += sizeof(uint32_t);
416
+ return p;
417
+ }
418
+
419
+ static bucket_t* read_bucket(uint8_t* p, uint32_t num)
420
+ {
421
+ uint32_t i;
422
+ bucket_t *bucket = (bucket_t*)calloc(num, sizeof(bucket_t));
423
+ for (i = 0;i < num;++i) {
424
+ bucket[i].hash = read_uint32(p);
425
+ p += sizeof(uint32_t);
426
+ bucket[i].offset = read_uint32(p);
427
+ p += sizeof(uint32_t);
428
+ }
429
+ return bucket;
430
+ }
431
+
432
/**
 * Decode a stored backlink array into a newly allocated array.
 *  @param  p       Pointer to num encoded 32-bit offsets.
 *  @param  num     Number of offsets to decode.
 *  @return Array of num offsets that the caller must free(), or NULL when
 *          the allocation fails (a zero-sized request may also yield NULL).
 */
static uint32_t* read_backward_links(uint8_t* p, uint32_t num)
{
    uint32_t i;
    uint32_t *bwd = (uint32_t*)calloc(num, sizeof(uint32_t));

    /* Never write through a failed allocation. */
    if (bwd == NULL) {
        return NULL;
    }

    for (i = 0;i < num;++i) {
        bwd[i] = read_uint32(p);
        p += sizeof(uint32_t);
    }
    return bwd;
}
442
+
443
+ cqdb_t* cqdb_reader(void *buffer, size_t size)
444
+ {
445
+ int i;
446
+ cqdb_t* db = NULL;
447
+
448
+ /* The minimum size of a valid CQDB is OFFSET_DATA. */
449
+ if (size < OFFSET_DATA) {
450
+ return NULL;
451
+ }
452
+
453
+ /* Check the file chunkid. */
454
+ if (memcmp(buffer, CHUNKID, 4) != 0) {
455
+ return NULL;
456
+ }
457
+
458
+ db = (cqdb_t*)calloc(1, sizeof(cqdb_t));
459
+ if (db != NULL) {
460
+ uint8_t* p = NULL;
461
+
462
+ /* Set memory block and size. */
463
+ db->buffer = buffer;
464
+ db->size = size;
465
+
466
+ /* Read the database header. */
467
+ p = db->buffer;
468
+ strncpy((char*)db->header.chunkid, (const char*)p, 4);
469
+ p += sizeof(uint32_t);
470
+ db->header.size = read_uint32(p);
471
+ p += sizeof(uint32_t);
472
+ db->header.flag = read_uint32(p);
473
+ p += sizeof(uint32_t);
474
+ db->header.byteorder = read_uint32(p);
475
+ p += sizeof(uint32_t);
476
+ db->header.bwd_size = read_uint32(p);
477
+ p += sizeof(uint32_t);
478
+ db->header.bwd_offset = read_uint32(p);
479
+ p += sizeof(uint32_t);
480
+
481
+ /* Check the consistency of byte order. */
482
+ if (db->header.byteorder != BYTEORDER_CHECK) {
483
+ free(db);
484
+ return NULL;
485
+ }
486
+
487
+ /* Check the chunk size. */
488
+ if (size < db->header.size) {
489
+ free(db);
490
+ return NULL;
491
+ }
492
+
493
+ /* Set pointers to the hash tables. */
494
+ db->num = 0; /* Number of records. */
495
+ p = (db->buffer + OFFSET_REFS);
496
+ for (i = 0;i < NUM_TABLES;++i) {
497
+ tableref_t ref;
498
+ p = read_tableref(&ref, p);
499
+ if (ref.offset) {
500
+ /* Set buckets. */
501
+ db->ht[i].bucket = read_bucket(db->buffer + ref.offset, ref.num);
502
+ db->ht[i].num = ref.num;
503
+ } else {
504
+ /* An empty hash table. */
505
+ db->ht[i].bucket = NULL;
506
+ db->ht[i].num = 0;
507
+ }
508
+
509
+ /* The number of records is the half of the table size.*/
510
+ db->num += ref.num / 2;
511
+ }
512
+
513
+ /* Set the pointer to the backlink array if any. */
514
+ if (db->header.bwd_offset) {
515
+ db->bwd = read_backward_links(db->buffer + db->header.bwd_offset, db->num);
516
+ } else {
517
+ db->bwd = NULL;
518
+ }
519
+ }
520
+
521
+ return db;
522
+ }
523
+
524
+ void cqdb_delete(cqdb_t* db)
525
+ {
526
+ int i;
527
+
528
+ if (db != NULL) {
529
+ for (i = 0;i < NUM_TABLES;++i) {
530
+ free(db->ht[i].bucket);
531
+ }
532
+ free(db->bwd);
533
+ free(db);
534
+ }
535
+ }
536
+
537
+ int cqdb_to_id(cqdb_t* db, const char *str)
538
+ {
539
+ uint32_t hv = hashlittle(str, strlen(str)+1, 0);
540
+ int t = hv % 256;
541
+ table_t* ht = &db->ht[t];
542
+
543
+ if (ht->num && ht->bucket != NULL) {
544
+ int n = ht->num;
545
+ int k = (hv >> 8) % n;
546
+ bucket_t* p = NULL;
547
+
548
+ while (p = &ht->bucket[k], p->offset) {
549
+ if (p->hash == hv) {
550
+ int value;
551
+ uint32_t ksize;
552
+ uint8_t *q = db->buffer + p->offset;
553
+ value = (int)read_uint32(q);
554
+ q += sizeof(uint32_t);
555
+ ksize = read_uint32(q);
556
+ q += sizeof(uint32_t);
557
+ if (strcmp(str, (const char *)q) == 0) {
558
+ return value;
559
+ }
560
+ }
561
+ k = (k+1) % n;
562
+ }
563
+ }
564
+
565
+ return CQDB_ERROR_NOTFOUND;
566
+ }
567
+
568
+ const char* cqdb_to_string(cqdb_t* db, int id)
569
+ {
570
+ /* Check if the current database supports the backward look-up. */
571
+ if (db->bwd != NULL && (uint32_t)id < db->header.bwd_size) {
572
+ uint32_t offset = db->bwd[id];
573
+ if (offset) {
574
+ uint8_t *p = db->buffer + offset;
575
+ p += sizeof(uint32_t); /* Skip key data. */
576
+ p += sizeof(uint32_t); /* Skip value size. */
577
+ return (const char *)p;
578
+ }
579
+ }
580
+
581
+ return NULL;
582
+ }
583
+
584
+ int cqdb_num(cqdb_t* db)
585
+ {
586
+ return db->num;
587
+ }