pdf2json 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (473)
  1. data/README.markdown +9 -0
  2. data/bin/.gitkeep +0 -0
  3. data/ext/extconf.rb +30 -0
  4. data/lib/pdf2json.rb +8 -0
  5. data/pdf2json-0.52-source/AUTHORS +24 -0
  6. data/pdf2json-0.52-source/CHANGES +11 -0
  7. data/pdf2json-0.52-source/Makefile +84 -0
  8. data/pdf2json-0.52-source/Makefile.in +84 -0
  9. data/pdf2json-0.52-source/aclocal.m4 +274 -0
  10. data/pdf2json-0.52-source/aconf-win32.h +86 -0
  11. data/pdf2json-0.52-source/aconf.h +42 -0
  12. data/pdf2json-0.52-source/aconf.h.in +41 -0
  13. data/pdf2json-0.52-source/autom4te.cache/output.0 +6908 -0
  14. data/pdf2json-0.52-source/autom4te.cache/requests +76 -0
  15. data/pdf2json-0.52-source/autom4te.cache/traces.0 +466 -0
  16. data/pdf2json-0.52-source/config.log +1259 -0
  17. data/pdf2json-0.52-source/config.status +1050 -0
  18. data/pdf2json-0.52-source/configure +6908 -0
  19. data/pdf2json-0.52-source/configure.ac +93 -0
  20. data/pdf2json-0.52-source/doc/pdffonts.1 +130 -0
  21. data/pdf2json-0.52-source/doc/pdffonts.cat +107 -0
  22. data/pdf2json-0.52-source/doc/pdffonts.hlp +117 -0
  23. data/pdf2json-0.52-source/doc/pdfimages.1 +102 -0
  24. data/pdf2json-0.52-source/doc/pdfimages.cat +92 -0
  25. data/pdf2json-0.52-source/doc/pdfimages.hlp +101 -0
  26. data/pdf2json-0.52-source/doc/pdfinfo.1 +158 -0
  27. data/pdf2json-0.52-source/doc/pdfinfo.cat +119 -0
  28. data/pdf2json-0.52-source/doc/pdfinfo.hlp +129 -0
  29. data/pdf2json-0.52-source/doc/pdftoppm.1 +115 -0
  30. data/pdf2json-0.52-source/doc/pdftoppm.cat +105 -0
  31. data/pdf2json-0.52-source/doc/pdftoppm.hlp +114 -0
  32. data/pdf2json-0.52-source/doc/pdftops.1 +229 -0
  33. data/pdf2json-0.52-source/doc/pdftops.cat +221 -0
  34. data/pdf2json-0.52-source/doc/pdftops.hlp +231 -0
  35. data/pdf2json-0.52-source/doc/pdftotext.1 +137 -0
  36. data/pdf2json-0.52-source/doc/pdftotext.cat +120 -0
  37. data/pdf2json-0.52-source/doc/pdftotext.hlp +133 -0
  38. data/pdf2json-0.52-source/doc/sample-xpdfrc +91 -0
  39. data/pdf2json-0.52-source/doc/xpdf.1 +513 -0
  40. data/pdf2json-0.52-source/doc/xpdf.cat +476 -0
  41. data/pdf2json-0.52-source/doc/xpdf.hlp +489 -0
  42. data/pdf2json-0.52-source/doc/xpdfrc.5 +480 -0
  43. data/pdf2json-0.52-source/doc/xpdfrc.cat +474 -0
  44. data/pdf2json-0.52-source/doc/xpdfrc.hlp +479 -0
  45. data/pdf2json-0.52-source/fofi/.DS_Store +0 -0
  46. data/pdf2json-0.52-source/fofi/FoFiBase.cc +156 -0
  47. data/pdf2json-0.52-source/fofi/FoFiBase.h +57 -0
  48. data/pdf2json-0.52-source/fofi/FoFiBase.o +0 -0
  49. data/pdf2json-0.52-source/fofi/FoFiEncodings.cc +994 -0
  50. data/pdf2json-0.52-source/fofi/FoFiEncodings.h +36 -0
  51. data/pdf2json-0.52-source/fofi/FoFiEncodings.o +0 -0
  52. data/pdf2json-0.52-source/fofi/FoFiTrueType.cc +2027 -0
  53. data/pdf2json-0.52-source/fofi/FoFiTrueType.h +174 -0
  54. data/pdf2json-0.52-source/fofi/FoFiTrueType.o +0 -0
  55. data/pdf2json-0.52-source/fofi/FoFiType1.cc +252 -0
  56. data/pdf2json-0.52-source/fofi/FoFiType1.h +59 -0
  57. data/pdf2json-0.52-source/fofi/FoFiType1.o +0 -0
  58. data/pdf2json-0.52-source/fofi/FoFiType1C.cc +2603 -0
  59. data/pdf2json-0.52-source/fofi/FoFiType1C.h +233 -0
  60. data/pdf2json-0.52-source/fofi/FoFiType1C.o +0 -0
  61. data/pdf2json-0.52-source/fofi/Makefile +70 -0
  62. data/pdf2json-0.52-source/fofi/Makefile.dep +0 -0
  63. data/pdf2json-0.52-source/fofi/Makefile.in +70 -0
  64. data/pdf2json-0.52-source/fofi/libfofi.a +0 -0
  65. data/pdf2json-0.52-source/fofi/vms_make.com +0 -0
  66. data/pdf2json-0.52-source/freetype.win32/.DS_Store +0 -0
  67. data/pdf2json-0.52-source/freetype.win32/include/.DS_Store +0 -0
  68. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftconfig.h +528 -0
  69. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftheader.h +780 -0
  70. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftmodule.h +32 -0
  71. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftoption.h +733 -0
  72. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftstdlib.h +173 -0
  73. data/pdf2json-0.52-source/freetype.win32/include/freetype/freetype.h +3919 -0
  74. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftadvanc.h +179 -0
  75. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftbbox.h +94 -0
  76. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftbdf.h +209 -0
  77. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftbitmap.h +227 -0
  78. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftcache.h +1128 -0
  79. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftchapters.h +103 -0
  80. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftcid.h +166 -0
  81. data/pdf2json-0.52-source/freetype.win32/include/freetype/fterrdef.h +244 -0
  82. data/pdf2json-0.52-source/freetype.win32/include/freetype/fterrors.h +206 -0
  83. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftgasp.h +120 -0
  84. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftglyph.h +613 -0
  85. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftgxval.h +358 -0
  86. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftgzip.h +102 -0
  87. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftimage.h +1313 -0
  88. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftincrem.h +353 -0
  89. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftlcdfil.h +213 -0
  90. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftlist.h +277 -0
  91. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftlzw.h +99 -0
  92. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftmac.h +274 -0
  93. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftmm.h +378 -0
  94. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftmodapi.h +483 -0
  95. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftmoderr.h +155 -0
  96. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftotval.h +203 -0
  97. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftoutln.h +537 -0
  98. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftpfr.h +172 -0
  99. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftrender.h +230 -0
  100. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftsizes.h +159 -0
  101. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftsnames.h +200 -0
  102. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftstroke.h +716 -0
  103. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftsynth.h +80 -0
  104. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftsystem.h +347 -0
  105. data/pdf2json-0.52-source/freetype.win32/include/freetype/fttrigon.h +350 -0
  106. data/pdf2json-0.52-source/freetype.win32/include/freetype/fttypes.h +588 -0
  107. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftwinfnt.h +274 -0
  108. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftxf86.h +83 -0
  109. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/autohint.h +231 -0
  110. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftcalc.h +179 -0
  111. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftdebug.h +250 -0
  112. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftdriver.h +422 -0
  113. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftgloadr.h +168 -0
  114. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftmemory.h +380 -0
  115. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftobjs.h +1428 -0
  116. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftpic.h +67 -0
  117. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftrfork.h +196 -0
  118. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftserv.h +620 -0
  119. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftstream.h +539 -0
  120. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/fttrace.h +139 -0
  121. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftvalid.h +150 -0
  122. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/internal.h +51 -0
  123. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/pcftypes.h +56 -0
  124. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/psaux.h +873 -0
  125. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/pshints.h +712 -0
  126. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svbdf.h +77 -0
  127. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svcid.h +83 -0
  128. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svgldict.h +82 -0
  129. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svgxval.h +72 -0
  130. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svkern.h +51 -0
  131. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svmm.h +104 -0
  132. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svotval.h +55 -0
  133. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svpfr.h +66 -0
  134. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svpostnm.h +79 -0
  135. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svpscmap.h +164 -0
  136. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svpsinfo.h +92 -0
  137. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svsfnt.h +102 -0
  138. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svttcmap.h +106 -0
  139. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svtteng.h +53 -0
  140. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svttglyf.h +67 -0
  141. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svwinfnt.h +50 -0
  142. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svxf86nm.h +55 -0
  143. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/sfnt.h +897 -0
  144. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/t1types.h +270 -0
  145. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/tttypes.h +1543 -0
  146. data/pdf2json-0.52-source/freetype.win32/include/freetype/t1tables.h +504 -0
  147. data/pdf2json-0.52-source/freetype.win32/include/freetype/ttnameid.h +1247 -0
  148. data/pdf2json-0.52-source/freetype.win32/include/freetype/tttables.h +759 -0
  149. data/pdf2json-0.52-source/freetype.win32/include/freetype/tttags.h +107 -0
  150. data/pdf2json-0.52-source/freetype.win32/include/freetype/ttunpat.h +59 -0
  151. data/pdf2json-0.52-source/freetype.win32/include/ft2build.h +39 -0
  152. data/pdf2json-0.52-source/freetype.win32/lib/freetype_a.lib +0 -0
  153. data/pdf2json-0.52-source/goo/.DS_Store +0 -0
  154. data/pdf2json-0.52-source/goo/FixedPoint.cc +118 -0
  155. data/pdf2json-0.52-source/goo/FixedPoint.h +155 -0
  156. data/pdf2json-0.52-source/goo/FixedPoint.o +0 -0
  157. data/pdf2json-0.52-source/goo/GHash.cc +380 -0
  158. data/pdf2json-0.52-source/goo/GHash.h +78 -0
  159. data/pdf2json-0.52-source/goo/GHash.o +0 -0
  160. data/pdf2json-0.52-source/goo/GList.cc +97 -0
  161. data/pdf2json-0.52-source/goo/GList.h +96 -0
  162. data/pdf2json-0.52-source/goo/GList.o +0 -0
  163. data/pdf2json-0.52-source/goo/GMutex.h +49 -0
  164. data/pdf2json-0.52-source/goo/GString.cc +724 -0
  165. data/pdf2json-0.52-source/goo/GString.cc.fixed +718 -0
  166. data/pdf2json-0.52-source/goo/GString.h +136 -0
  167. data/pdf2json-0.52-source/goo/GString.o +0 -0
  168. data/pdf2json-0.52-source/goo/ImgWriter.o +0 -0
  169. data/pdf2json-0.52-source/goo/JpegWriter.o +0 -0
  170. data/pdf2json-0.52-source/goo/Makefile +72 -0
  171. data/pdf2json-0.52-source/goo/Makefile.dep +0 -0
  172. data/pdf2json-0.52-source/goo/Makefile.in +72 -0
  173. data/pdf2json-0.52-source/goo/PNGWriter.o +0 -0
  174. data/pdf2json-0.52-source/goo/gfile.cc +731 -0
  175. data/pdf2json-0.52-source/goo/gfile.h +138 -0
  176. data/pdf2json-0.52-source/goo/gfile.o +0 -0
  177. data/pdf2json-0.52-source/goo/gmem.cc +264 -0
  178. data/pdf2json-0.52-source/goo/gmem.h +79 -0
  179. data/pdf2json-0.52-source/goo/gmem.o +0 -0
  180. data/pdf2json-0.52-source/goo/gmempp.cc +32 -0
  181. data/pdf2json-0.52-source/goo/gmempp.o +0 -0
  182. data/pdf2json-0.52-source/goo/gtypes.h +29 -0
  183. data/pdf2json-0.52-source/goo/libGoo.a +0 -0
  184. data/pdf2json-0.52-source/goo/parseargs.c +190 -0
  185. data/pdf2json-0.52-source/goo/parseargs.h +71 -0
  186. data/pdf2json-0.52-source/goo/parseargs.o +0 -0
  187. data/pdf2json-0.52-source/goo/vms_directory.c +214 -0
  188. data/pdf2json-0.52-source/goo/vms_dirent.h +67 -0
  189. data/pdf2json-0.52-source/goo/vms_make.com +82 -0
  190. data/pdf2json-0.52-source/goo/vms_sys_dirent.h +54 -0
  191. data/pdf2json-0.52-source/goo/vms_unix_time.h +102 -0
  192. data/pdf2json-0.52-source/goo/vms_unix_times.c +42 -0
  193. data/pdf2json-0.52-source/goo/vms_unlink.c +22 -0
  194. data/pdf2json-0.52-source/ms_make.bat +199 -0
  195. data/pdf2json-0.52-source/splash/.DS_Store +0 -0
  196. data/pdf2json-0.52-source/splash/Makefile +103 -0
  197. data/pdf2json-0.52-source/splash/Makefile.dep +0 -0
  198. data/pdf2json-0.52-source/splash/Makefile.in +103 -0
  199. data/pdf2json-0.52-source/splash/Splash.cc +3310 -0
  200. data/pdf2json-0.52-source/splash/Splash.h +293 -0
  201. data/pdf2json-0.52-source/splash/Splash.o +0 -0
  202. data/pdf2json-0.52-source/splash/SplashBitmap.cc +188 -0
  203. data/pdf2json-0.52-source/splash/SplashBitmap.h +64 -0
  204. data/pdf2json-0.52-source/splash/SplashBitmap.o +0 -0
  205. data/pdf2json-0.52-source/splash/SplashClip.cc +382 -0
  206. data/pdf2json-0.52-source/splash/SplashClip.h +107 -0
  207. data/pdf2json-0.52-source/splash/SplashClip.o +0 -0
  208. data/pdf2json-0.52-source/splash/SplashErrorCodes.h +32 -0
  209. data/pdf2json-0.52-source/splash/SplashFTFont.cc +357 -0
  210. data/pdf2json-0.52-source/splash/SplashFTFont.h +58 -0
  211. data/pdf2json-0.52-source/splash/SplashFTFont.o +0 -0
  212. data/pdf2json-0.52-source/splash/SplashFTFontEngine.cc +179 -0
  213. data/pdf2json-0.52-source/splash/SplashFTFontEngine.h +65 -0
  214. data/pdf2json-0.52-source/splash/SplashFTFontEngine.o +0 -0
  215. data/pdf2json-0.52-source/splash/SplashFTFontFile.cc +114 -0
  216. data/pdf2json-0.52-source/splash/SplashFTFontFile.h +73 -0
  217. data/pdf2json-0.52-source/splash/SplashFTFontFile.o +0 -0
  218. data/pdf2json-0.52-source/splash/SplashFont.cc +176 -0
  219. data/pdf2json-0.52-source/splash/SplashFont.h +104 -0
  220. data/pdf2json-0.52-source/splash/SplashFont.o +0 -0
  221. data/pdf2json-0.52-source/splash/SplashFontEngine.cc +317 -0
  222. data/pdf2json-0.52-source/splash/SplashFontEngine.h +91 -0
  223. data/pdf2json-0.52-source/splash/SplashFontEngine.o +0 -0
  224. data/pdf2json-0.52-source/splash/SplashFontFile.cc +55 -0
  225. data/pdf2json-0.52-source/splash/SplashFontFile.h +60 -0
  226. data/pdf2json-0.52-source/splash/SplashFontFile.o +0 -0
  227. data/pdf2json-0.52-source/splash/SplashFontFileID.cc +23 -0
  228. data/pdf2json-0.52-source/splash/SplashFontFileID.h +30 -0
  229. data/pdf2json-0.52-source/splash/SplashFontFileID.o +0 -0
  230. data/pdf2json-0.52-source/splash/SplashGlyphBitmap.h +26 -0
  231. data/pdf2json-0.52-source/splash/SplashMath.h +89 -0
  232. data/pdf2json-0.52-source/splash/SplashPath.cc +184 -0
  233. data/pdf2json-0.52-source/splash/SplashPath.h +121 -0
  234. data/pdf2json-0.52-source/splash/SplashPath.o +0 -0
  235. data/pdf2json-0.52-source/splash/SplashPattern.cc +40 -0
  236. data/pdf2json-0.52-source/splash/SplashPattern.h +65 -0
  237. data/pdf2json-0.52-source/splash/SplashPattern.o +0 -0
  238. data/pdf2json-0.52-source/splash/SplashScreen.cc +383 -0
  239. data/pdf2json-0.52-source/splash/SplashScreen.h +56 -0
  240. data/pdf2json-0.52-source/splash/SplashScreen.o +0 -0
  241. data/pdf2json-0.52-source/splash/SplashState.cc +165 -0
  242. data/pdf2json-0.52-source/splash/SplashState.h +103 -0
  243. data/pdf2json-0.52-source/splash/SplashState.o +0 -0
  244. data/pdf2json-0.52-source/splash/SplashT1Font.cc +287 -0
  245. data/pdf2json-0.52-source/splash/SplashT1Font.h +57 -0
  246. data/pdf2json-0.52-source/splash/SplashT1Font.o +0 -0
  247. data/pdf2json-0.52-source/splash/SplashT1FontEngine.cc +124 -0
  248. data/pdf2json-0.52-source/splash/SplashT1FontEngine.h +53 -0
  249. data/pdf2json-0.52-source/splash/SplashT1FontEngine.o +0 -0
  250. data/pdf2json-0.52-source/splash/SplashT1FontFile.cc +97 -0
  251. data/pdf2json-0.52-source/splash/SplashT1FontFile.h +58 -0
  252. data/pdf2json-0.52-source/splash/SplashT1FontFile.o +0 -0
  253. data/pdf2json-0.52-source/splash/SplashTypes.h +132 -0
  254. data/pdf2json-0.52-source/splash/SplashXPath.cc +438 -0
  255. data/pdf2json-0.52-source/splash/SplashXPath.h +100 -0
  256. data/pdf2json-0.52-source/splash/SplashXPath.o +0 -0
  257. data/pdf2json-0.52-source/splash/SplashXPathScanner.cc +428 -0
  258. data/pdf2json-0.52-source/splash/SplashXPathScanner.h +87 -0
  259. data/pdf2json-0.52-source/splash/SplashXPathScanner.o +0 -0
  260. data/pdf2json-0.52-source/splash/libsplash.a +0 -0
  261. data/pdf2json-0.52-source/splash/vms_make.com +0 -0
  262. data/pdf2json-0.52-source/src/.DS_Store +0 -0
  263. data/pdf2json-0.52-source/src/GVector.h +101 -0
  264. data/pdf2json-0.52-source/src/ImgOutputDev.cc +1243 -0
  265. data/pdf2json-0.52-source/src/ImgOutputDev.h +307 -0
  266. data/pdf2json-0.52-source/src/ImgOutputDev.o +0 -0
  267. data/pdf2json-0.52-source/src/Makefile +68 -0
  268. data/pdf2json-0.52-source/src/Makefile.in +68 -0
  269. data/pdf2json-0.52-source/src/XmlFonts.cc +367 -0
  270. data/pdf2json-0.52-source/src/XmlFonts.h +91 -0
  271. data/pdf2json-0.52-source/src/XmlFonts.o +0 -0
  272. data/pdf2json-0.52-source/src/XmlLinks.cc +101 -0
  273. data/pdf2json-0.52-source/src/XmlLinks.h +54 -0
  274. data/pdf2json-0.52-source/src/XmlLinks.o +0 -0
  275. data/pdf2json-0.52-source/src/pdf2json +0 -0
  276. data/pdf2json-0.52-source/src/pdf2json.cc +343 -0
  277. data/pdf2json-0.52-source/src/pdf2json.o +0 -0
  278. data/pdf2json-0.52-source/src/pdf2xml.dtd +22 -0
  279. data/pdf2json-0.52-source/src/pdf2xmljson.dtd +9 -0
  280. data/pdf2json-0.52-source/xpdf/.DS_Store +0 -0
  281. data/pdf2json-0.52-source/xpdf/Annot.cc +1556 -0
  282. data/pdf2json-0.52-source/xpdf/Annot.h +142 -0
  283. data/pdf2json-0.52-source/xpdf/Annot.o +0 -0
  284. data/pdf2json-0.52-source/xpdf/Array.cc +73 -0
  285. data/pdf2json-0.52-source/xpdf/Array.h +58 -0
  286. data/pdf2json-0.52-source/xpdf/Array.o +0 -0
  287. data/pdf2json-0.52-source/xpdf/BuiltinFont.cc +65 -0
  288. data/pdf2json-0.52-source/xpdf/BuiltinFont.h +57 -0
  289. data/pdf2json-0.52-source/xpdf/BuiltinFont.o +0 -0
  290. data/pdf2json-0.52-source/xpdf/BuiltinFontTables.cc +4284 -0
  291. data/pdf2json-0.52-source/xpdf/BuiltinFontTables.h +23 -0
  292. data/pdf2json-0.52-source/xpdf/BuiltinFontTables.o +0 -0
  293. data/pdf2json-0.52-source/xpdf/CMap.cc +408 -0
  294. data/pdf2json-0.52-source/xpdf/CMap.h +102 -0
  295. data/pdf2json-0.52-source/xpdf/CMap.o +0 -0
  296. data/pdf2json-0.52-source/xpdf/Catalog.cc +374 -0
  297. data/pdf2json-0.52-source/xpdf/Catalog.h +97 -0
  298. data/pdf2json-0.52-source/xpdf/Catalog.o +0 -0
  299. data/pdf2json-0.52-source/xpdf/CharCodeToUnicode.cc +540 -0
  300. data/pdf2json-0.52-source/xpdf/CharCodeToUnicode.h +117 -0
  301. data/pdf2json-0.52-source/xpdf/CharCodeToUnicode.o +0 -0
  302. data/pdf2json-0.52-source/xpdf/CharTypes.h +24 -0
  303. data/pdf2json-0.52-source/xpdf/CompactFontTables.h +464 -0
  304. data/pdf2json-0.52-source/xpdf/CoreOutputDev.cc +61 -0
  305. data/pdf2json-0.52-source/xpdf/CoreOutputDev.h +61 -0
  306. data/pdf2json-0.52-source/xpdf/Decrypt.cc +776 -0
  307. data/pdf2json-0.52-source/xpdf/Decrypt.h +95 -0
  308. data/pdf2json-0.52-source/xpdf/Decrypt.o +0 -0
  309. data/pdf2json-0.52-source/xpdf/Dict.cc +95 -0
  310. data/pdf2json-0.52-source/xpdf/Dict.h +77 -0
  311. data/pdf2json-0.52-source/xpdf/Dict.o +0 -0
  312. data/pdf2json-0.52-source/xpdf/Error.cc +38 -0
  313. data/pdf2json-0.52-source/xpdf/Error.h +23 -0
  314. data/pdf2json-0.52-source/xpdf/Error.o +0 -0
  315. data/pdf2json-0.52-source/xpdf/ErrorCodes.h +36 -0
  316. data/pdf2json-0.52-source/xpdf/FontEncodingTables.cc +1824 -0
  317. data/pdf2json-0.52-source/xpdf/FontEncodingTables.h +20 -0
  318. data/pdf2json-0.52-source/xpdf/FontEncodingTables.o +0 -0
  319. data/pdf2json-0.52-source/xpdf/Function.cc +1573 -0
  320. data/pdf2json-0.52-source/xpdf/Function.h +229 -0
  321. data/pdf2json-0.52-source/xpdf/Function.o +0 -0
  322. data/pdf2json-0.52-source/xpdf/Gfx.cc +4187 -0
  323. data/pdf2json-0.52-source/xpdf/Gfx.h +312 -0
  324. data/pdf2json-0.52-source/xpdf/Gfx.o +0 -0
  325. data/pdf2json-0.52-source/xpdf/GfxFont.cc +1568 -0
  326. data/pdf2json-0.52-source/xpdf/GfxFont.h +320 -0
  327. data/pdf2json-0.52-source/xpdf/GfxFont.o +0 -0
  328. data/pdf2json-0.52-source/xpdf/GfxState.cc +4137 -0
  329. data/pdf2json-0.52-source/xpdf/GfxState.h +1244 -0
  330. data/pdf2json-0.52-source/xpdf/GfxState.o +0 -0
  331. data/pdf2json-0.52-source/xpdf/GlobalParams.cc +2924 -0
  332. data/pdf2json-0.52-source/xpdf/GlobalParams.cc.old +2908 -0
  333. data/pdf2json-0.52-source/xpdf/GlobalParams.h +466 -0
  334. data/pdf2json-0.52-source/xpdf/GlobalParams.h.old +463 -0
  335. data/pdf2json-0.52-source/xpdf/GlobalParams.o +0 -0
  336. data/pdf2json-0.52-source/xpdf/ImageOutputDev.cc +195 -0
  337. data/pdf2json-0.52-source/xpdf/ImageOutputDev.h +76 -0
  338. data/pdf2json-0.52-source/xpdf/ImageOutputDev.o +0 -0
  339. data/pdf2json-0.52-source/xpdf/JArithmeticDecoder.cc +322 -0
  340. data/pdf2json-0.52-source/xpdf/JArithmeticDecoder.h +109 -0
  341. data/pdf2json-0.52-source/xpdf/JArithmeticDecoder.o +0 -0
  342. data/pdf2json-0.52-source/xpdf/JBIG2Stream.cc +3413 -0
  343. data/pdf2json-0.52-source/xpdf/JBIG2Stream.h +145 -0
  344. data/pdf2json-0.52-source/xpdf/JBIG2Stream.o +0 -0
  345. data/pdf2json-0.52-source/xpdf/JPXStream.cc +3144 -0
  346. data/pdf2json-0.52-source/xpdf/JPXStream.h +351 -0
  347. data/pdf2json-0.52-source/xpdf/JPXStream.o +0 -0
  348. data/pdf2json-0.52-source/xpdf/Lexer.cc +485 -0
  349. data/pdf2json-0.52-source/xpdf/Lexer.h +80 -0
  350. data/pdf2json-0.52-source/xpdf/Lexer.o +0 -0
  351. data/pdf2json-0.52-source/xpdf/Link.cc +806 -0
  352. data/pdf2json-0.52-source/xpdf/Link.cc.old +784 -0
  353. data/pdf2json-0.52-source/xpdf/Link.h +415 -0
  354. data/pdf2json-0.52-source/xpdf/Link.h.old +369 -0
  355. data/pdf2json-0.52-source/xpdf/Link.o +0 -0
  356. data/pdf2json-0.52-source/xpdf/Makefile +232 -0
  357. data/pdf2json-0.52-source/xpdf/Makefile.dep +0 -0
  358. data/pdf2json-0.52-source/xpdf/Makefile.in +232 -0
  359. data/pdf2json-0.52-source/xpdf/NameToCharCode.cc +116 -0
  360. data/pdf2json-0.52-source/xpdf/NameToCharCode.h +42 -0
  361. data/pdf2json-0.52-source/xpdf/NameToCharCode.o +0 -0
  362. data/pdf2json-0.52-source/xpdf/NameToUnicodeTable.h +1097 -0
  363. data/pdf2json-0.52-source/xpdf/Object.cc +231 -0
  364. data/pdf2json-0.52-source/xpdf/Object.h +303 -0
  365. data/pdf2json-0.52-source/xpdf/Object.o +0 -0
  366. data/pdf2json-0.52-source/xpdf/Outline.cc +151 -0
  367. data/pdf2json-0.52-source/xpdf/Outline.h +76 -0
  368. data/pdf2json-0.52-source/xpdf/Outline.o +0 -0
  369. data/pdf2json-0.52-source/xpdf/OutputDev.cc +131 -0
  370. data/pdf2json-0.52-source/xpdf/OutputDev.h +253 -0
  371. data/pdf2json-0.52-source/xpdf/OutputDev.o +0 -0
  372. data/pdf2json-0.52-source/xpdf/PDFCore.cc +2044 -0
  373. data/pdf2json-0.52-source/xpdf/PDFCore.h +321 -0
  374. data/pdf2json-0.52-source/xpdf/PDFDoc.cc +404 -0
  375. data/pdf2json-0.52-source/xpdf/PDFDoc.h +183 -0
  376. data/pdf2json-0.52-source/xpdf/PDFDoc.o +0 -0
  377. data/pdf2json-0.52-source/xpdf/PDFDocEncoding.cc +44 -0
  378. data/pdf2json-0.52-source/xpdf/PDFDocEncoding.h +16 -0
  379. data/pdf2json-0.52-source/xpdf/PDFDocEncoding.o +0 -0
  380. data/pdf2json-0.52-source/xpdf/PSOutputDev.cc +6224 -0
  381. data/pdf2json-0.52-source/xpdf/PSOutputDev.h +395 -0
  382. data/pdf2json-0.52-source/xpdf/PSOutputDev.o +0 -0
  383. data/pdf2json-0.52-source/xpdf/PSTokenizer.cc +135 -0
  384. data/pdf2json-0.52-source/xpdf/PSTokenizer.h +41 -0
  385. data/pdf2json-0.52-source/xpdf/PSTokenizer.o +0 -0
  386. data/pdf2json-0.52-source/xpdf/Page.cc +454 -0
  387. data/pdf2json-0.52-source/xpdf/Page.h +187 -0
  388. data/pdf2json-0.52-source/xpdf/Page.o +0 -0
  389. data/pdf2json-0.52-source/xpdf/Parser.cc +227 -0
  390. data/pdf2json-0.52-source/xpdf/Parser.h +59 -0
  391. data/pdf2json-0.52-source/xpdf/Parser.o +0 -0
  392. data/pdf2json-0.52-source/xpdf/PreScanOutputDev.cc +257 -0
  393. data/pdf2json-0.52-source/xpdf/PreScanOutputDev.h +130 -0
  394. data/pdf2json-0.52-source/xpdf/PreScanOutputDev.o +0 -0
  395. data/pdf2json-0.52-source/xpdf/SecurityHandler.cc +390 -0
  396. data/pdf2json-0.52-source/xpdf/SecurityHandler.h +160 -0
  397. data/pdf2json-0.52-source/xpdf/SecurityHandler.o +0 -0
  398. data/pdf2json-0.52-source/xpdf/SplashOutputDev.cc +2845 -0
  399. data/pdf2json-0.52-source/xpdf/SplashOutputDev.h +247 -0
  400. data/pdf2json-0.52-source/xpdf/SplashOutputDev.o +0 -0
  401. data/pdf2json-0.52-source/xpdf/Stream-CCITT.h +459 -0
  402. data/pdf2json-0.52-source/xpdf/Stream.cc +4627 -0
  403. data/pdf2json-0.52-source/xpdf/Stream.h +858 -0
  404. data/pdf2json-0.52-source/xpdf/Stream.o +0 -0
  405. data/pdf2json-0.52-source/xpdf/TextOutputDev.cc +4090 -0
  406. data/pdf2json-0.52-source/xpdf/TextOutputDev.h +661 -0
  407. data/pdf2json-0.52-source/xpdf/TextOutputDev.o +0 -0
  408. data/pdf2json-0.52-source/xpdf/UTF8.h +56 -0
  409. data/pdf2json-0.52-source/xpdf/UnicodeMap.cc +302 -0
  410. data/pdf2json-0.52-source/xpdf/UnicodeMap.cc.old +293 -0
  411. data/pdf2json-0.52-source/xpdf/UnicodeMap.h +135 -0
  412. data/pdf2json-0.52-source/xpdf/UnicodeMap.h.old +123 -0
  413. data/pdf2json-0.52-source/xpdf/UnicodeMap.o +0 -0
  414. data/pdf2json-0.52-source/xpdf/UnicodeMapTables.h +361 -0
  415. data/pdf2json-0.52-source/xpdf/UnicodeTypeTable.cc +949 -0
  416. data/pdf2json-0.52-source/xpdf/UnicodeTypeTable.h +20 -0
  417. data/pdf2json-0.52-source/xpdf/UnicodeTypeTable.o +0 -0
  418. data/pdf2json-0.52-source/xpdf/XPDFApp.cc +447 -0
  419. data/pdf2json-0.52-source/xpdf/XPDFApp.h +114 -0
  420. data/pdf2json-0.52-source/xpdf/XPDFCore.cc +1655 -0
  421. data/pdf2json-0.52-source/xpdf/XPDFCore.h +251 -0
  422. data/pdf2json-0.52-source/xpdf/XPDFTree.cc +931 -0
  423. data/pdf2json-0.52-source/xpdf/XPDFTree.h +45 -0
  424. data/pdf2json-0.52-source/xpdf/XPDFTreeP.h +87 -0
  425. data/pdf2json-0.52-source/xpdf/XPDFViewer.cc +3488 -0
  426. data/pdf2json-0.52-source/xpdf/XPDFViewer.h +352 -0
  427. data/pdf2json-0.52-source/xpdf/XRef.cc +896 -0
  428. data/pdf2json-0.52-source/xpdf/XRef.h +133 -0
  429. data/pdf2json-0.52-source/xpdf/XRef.o +0 -0
  430. data/pdf2json-0.52-source/xpdf/XpdfPluginAPI.cc +262 -0
  431. data/pdf2json-0.52-source/xpdf/XpdfPluginAPI.h +341 -0
  432. data/pdf2json-0.52-source/xpdf/XpdfPluginAPI.o +0 -0
  433. data/pdf2json-0.52-source/xpdf/about-text.h +48 -0
  434. data/pdf2json-0.52-source/xpdf/about.xbm +6 -0
  435. data/pdf2json-0.52-source/xpdf/backArrow.xbm +6 -0
  436. data/pdf2json-0.52-source/xpdf/backArrowDis.xbm +6 -0
  437. data/pdf2json-0.52-source/xpdf/config.h +112 -0
  438. data/pdf2json-0.52-source/xpdf/dblLeftArrow.xbm +6 -0
  439. data/pdf2json-0.52-source/xpdf/dblLeftArrowDis.xbm +6 -0
  440. data/pdf2json-0.52-source/xpdf/dblRightArrow.xbm +6 -0
  441. data/pdf2json-0.52-source/xpdf/dblRightArrowDis.xbm +6 -0
  442. data/pdf2json-0.52-source/xpdf/find.xbm +6 -0
  443. data/pdf2json-0.52-source/xpdf/findDis.xbm +6 -0
  444. data/pdf2json-0.52-source/xpdf/forwardArrow.xbm +6 -0
  445. data/pdf2json-0.52-source/xpdf/forwardArrowDis.xbm +6 -0
  446. data/pdf2json-0.52-source/xpdf/leftArrow.xbm +5 -0
  447. data/pdf2json-0.52-source/xpdf/leftArrowDis.xbm +5 -0
  448. data/pdf2json-0.52-source/xpdf/libXpdf.a +0 -0
  449. data/pdf2json-0.52-source/xpdf/pdffonts +0 -0
  450. data/pdf2json-0.52-source/xpdf/pdffonts.cc +298 -0
  451. data/pdf2json-0.52-source/xpdf/pdffonts.o +0 -0
  452. data/pdf2json-0.52-source/xpdf/pdfimages +0 -0
  453. data/pdf2json-0.52-source/xpdf/pdfimages.cc +155 -0
  454. data/pdf2json-0.52-source/xpdf/pdfimages.o +0 -0
  455. data/pdf2json-0.52-source/xpdf/pdfinfo +0 -0
  456. data/pdf2json-0.52-source/xpdf/pdfinfo.cc +387 -0
  457. data/pdf2json-0.52-source/xpdf/pdfinfo.o +0 -0
  458. data/pdf2json-0.52-source/xpdf/pdftoppm.cc +203 -0
  459. data/pdf2json-0.52-source/xpdf/pdftops +0 -0
  460. data/pdf2json-0.52-source/xpdf/pdftops.cc +344 -0
  461. data/pdf2json-0.52-source/xpdf/pdftops.o +0 -0
  462. data/pdf2json-0.52-source/xpdf/pdftotext +0 -0
  463. data/pdf2json-0.52-source/xpdf/pdftotext.cc +333 -0
  464. data/pdf2json-0.52-source/xpdf/pdftotext.o +0 -0
  465. data/pdf2json-0.52-source/xpdf/print.xbm +6 -0
  466. data/pdf2json-0.52-source/xpdf/printDis.xbm +6 -0
  467. data/pdf2json-0.52-source/xpdf/rightArrow.xbm +5 -0
  468. data/pdf2json-0.52-source/xpdf/rightArrowDis.xbm +5 -0
  469. data/pdf2json-0.52-source/xpdf/vms_make.com +129 -0
  470. data/pdf2json-0.52-source/xpdf/xpdf.cc +344 -0
  471. data/pdf2json-0.52-source/xpdf/xpdfIcon.xpm +62 -0
  472. data/pdf2json.gemspec +29 -0
  473. metadata +518 -0
Binary file
@@ -0,0 +1,4090 @@
1
+ //========================================================================
2
+ //
3
+ // TextOutputDev.cc
4
+ //
5
+ // Copyright 1997-2003 Glyph & Cog, LLC
6
+ //
7
+ //========================================================================
8
+
9
+ #include <aconf.h>
10
+
11
+ #ifdef USE_GCC_PRAGMAS
12
+ #pragma implementation
13
+ #endif
14
+
15
+ #include <stdio.h>
16
+ #include <stdlib.h>
17
+ #include <stddef.h>
18
+ #include <math.h>
19
+ #include <ctype.h>
20
+ #ifdef WIN32
21
+ #include <fcntl.h> // for O_BINARY
22
+ #include <io.h> // for setmode
23
+ #endif
24
+ #include "gmem.h"
25
+ #include "GString.h"
26
+ #include "GList.h"
27
+ #include "config.h"
28
+ #include "Error.h"
29
+ #include "GlobalParams.h"
30
+ #include "UnicodeMap.h"
31
+ #include "UnicodeTypeTable.h"
32
+ #include "GfxState.h"
33
+ #include "Link.h"
34
+ #include "TextOutputDev.h"
35
+
36
+ #ifdef MACOS
37
+ // needed for setting type/creator of MacOS files
38
+ #include "ICSupport.h"
39
+ #endif
40
+
41
+ //------------------------------------------------------------------------
42
+ // parameters
43
+ //------------------------------------------------------------------------
44
+
45
+ // Each bucket in a text pool includes baselines within a range of
46
+ // this many points.
47
+ #define textPoolStep 4
48
+
49
+ // Inter-character space width which will cause addChar to start a new
50
+ // word.
51
+ #define minWordBreakSpace 0.1
52
+
53
+ // Negative inter-character space width, i.e., overlap, which will
54
+ // cause addChar to start a new word.
55
+ #define minDupBreakOverlap 0.2
56
+
57
+ // Max distance between baselines of two lines within a block, as a
58
+ // fraction of the font size.
59
+ #define maxLineSpacingDelta 1.5
60
+
61
+ // Max difference in primary font sizes on two lines in the same
62
+ // block. Delta1 is used when examining new lines above and below the
63
+ // current block; delta2 is used when examining text that overlaps the
64
+ // current block; delta3 is used when examining text to the left and
65
+ // right of the current block.
66
+ #define maxBlockFontSizeDelta1 0.05
67
+ #define maxBlockFontSizeDelta2 0.6
68
+ #define maxBlockFontSizeDelta3 0.2
69
+
70
+ // Max difference in font sizes inside a word.
71
+ #define maxWordFontSizeDelta 0.05
72
+
73
+ // Maximum distance between baselines of two words on the same line,
74
+ // e.g., distance between subscript or superscript and the primary
75
+ // baseline, as a fraction of the font size.
76
+ #define maxIntraLineDelta 0.5
77
+
78
+ // Minimum inter-word spacing, as a fraction of the font size. (Only
79
+ // used for raw ordering.)
80
+ #define minWordSpacing 0.15
81
+
82
+ // Maximum inter-word spacing, as a fraction of the font size.
83
+ #define maxWordSpacing 1.5
84
+
85
+ // Maximum horizontal spacing which will allow a word to be pulled
86
+ // into a block.
87
+ #define minColSpacing1 0.3
88
+
89
+ // Minimum spacing between columns, as a fraction of the font size.
90
+ #define minColSpacing2 1.0
91
+
92
+ // Maximum vertical spacing between blocks within a flow, as a
93
+ // multiple of the font size.
94
+ #define maxBlockSpacing 2.5
95
+
96
+ // Minimum spacing between characters within a word, as a fraction of
97
+ // the font size.
98
+ #define minCharSpacing -0.2
99
+
100
+ // Maximum spacing between characters within a word, as a fraction of
101
+ // the font size, when there is no obvious extra-wide character
102
+ // spacing.
103
+ #define maxCharSpacing 0.03
104
+
105
+ // When extra-wide character spacing is detected, the inter-character
106
+ // space threshold is set to the minimum inter-character space
107
+ // multiplied by this constant.
108
+ #define maxWideCharSpacingMul 1.3
109
+
110
+ // Upper limit on spacing between characters in a word.
111
+ #define maxWideCharSpacing 0.4
112
+
113
+ // Max difference in primary,secondary coordinates (as a fraction of
114
+ // the font size) allowed for duplicated text (fake boldface, drop
115
+ // shadows) which is to be discarded.
116
+ #define dupMaxPriDelta 0.1
117
+ #define dupMaxSecDelta 0.2
118
+
119
+ // Max width of underlines (in points).
120
+ #define maxUnderlineWidth 3
121
+
122
+ // Min distance between baseline and underline (in points).
123
+ //~ this should be font-size-dependent
124
+ #define minUnderlineGap -2
125
+
126
+ // Max distance between baseline and underline (in points).
127
+ //~ this should be font-size-dependent
128
+ #define maxUnderlineGap 4
129
+
130
+ // Max horizontal distance between edge of word and start of underline
131
+ // (in points).
132
+ //~ this should be font-size-dependent
133
+ #define underlineSlack 1
134
+
135
+ // Max distance between edge of text and edge of link border
136
+ #define hyperlinkSlack 2
137
+
138
+ //------------------------------------------------------------------------
139
+ // TextUnderline
140
+ //------------------------------------------------------------------------
141
+
142
+ class TextUnderline {
143
+ public:
144
+
145
+ TextUnderline(double x0A, double y0A, double x1A, double y1A)
146
+ { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
147
+ ~TextUnderline() {}
148
+
149
+ double x0, y0, x1, y1;
150
+ GBool horiz;
151
+ };
152
+
153
+ //------------------------------------------------------------------------
154
+ // TextLink
155
+ //------------------------------------------------------------------------
156
+
157
+ class TextLink {
158
+ public:
159
+
160
+ TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, Link *linkA)
161
+ { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; link = linkA; }
162
+ ~TextLink() {}
163
+
164
+ int xMin, yMin, xMax, yMax;
165
+ Link *link;
166
+ };
167
+
168
+ //------------------------------------------------------------------------
169
+ // TextFontInfo
170
+ //------------------------------------------------------------------------
171
+
172
+ TextFontInfo::TextFontInfo(GfxState *state) {
173
+ gfxFont = state->getFont();
174
+ #if TEXTOUT_WORD_LIST
175
+ fontName = (gfxFont && gfxFont->getOrigName())
176
+ ? gfxFont->getOrigName()->copy()
177
+ : (GString *)NULL;
178
+ flags = gfxFont ? gfxFont->getFlags() : 0;
179
+ #endif
180
+ }
181
+
182
+ TextFontInfo::~TextFontInfo() {
183
+ #if TEXTOUT_WORD_LIST
184
+ if (fontName) {
185
+ delete fontName;
186
+ }
187
+ #endif
188
+ }
189
+
190
+ GBool TextFontInfo::matches(GfxState *state) {
191
+ return state->getFont() == gfxFont;
192
+ }
193
+
194
+ //------------------------------------------------------------------------
195
+ // TextWord
196
+ //------------------------------------------------------------------------
197
+
198
+ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
199
+ int charPosA, TextFontInfo *fontA, double fontSizeA) {
200
+ GfxFont *gfxFont;
201
+ double x, y, ascent, descent;
202
+
203
+ rot = rotA;
204
+ charPos = charPosA;
205
+ charLen = 0;
206
+ font = fontA;
207
+ fontSize = fontSizeA;
208
+ state->transform(x0, y0, &x, &y);
209
+ if ((gfxFont = font->gfxFont)) {
210
+ ascent = gfxFont->getAscent() * fontSize;
211
+ descent = gfxFont->getDescent() * fontSize;
212
+ } else {
213
+ // this means that the PDF file draws text without a current font,
214
+ // which should never happen
215
+ ascent = 0.95 * fontSize;
216
+ descent = -0.35 * fontSize;
217
+ }
218
+ switch (rot) {
219
+ case 0:
220
+ yMin = y - ascent;
221
+ yMax = y - descent;
222
+ if (yMin == yMax) {
223
+ // this is a sanity check for a case that shouldn't happen -- but
224
+ // if it does happen, we want to avoid dividing by zero later
225
+ yMin = y;
226
+ yMax = y + 1;
227
+ }
228
+ base = y;
229
+ break;
230
+ case 1:
231
+ xMin = x + descent;
232
+ xMax = x + ascent;
233
+ if (xMin == xMax) {
234
+ // this is a sanity check for a case that shouldn't happen -- but
235
+ // if it does happen, we want to avoid dividing by zero later
236
+ xMin = x;
237
+ xMax = x + 1;
238
+ }
239
+ base = x;
240
+ break;
241
+ case 2:
242
+ yMin = y + descent;
243
+ yMax = y + ascent;
244
+ if (yMin == yMax) {
245
+ // this is a sanity check for a case that shouldn't happen -- but
246
+ // if it does happen, we want to avoid dividing by zero later
247
+ yMin = y;
248
+ yMax = y + 1;
249
+ }
250
+ base = y;
251
+ break;
252
+ case 3:
253
+ xMin = x - ascent;
254
+ xMax = x - descent;
255
+ if (xMin == xMax) {
256
+ // this is a sanity check for a case that shouldn't happen -- but
257
+ // if it does happen, we want to avoid dividing by zero later
258
+ xMin = x;
259
+ xMax = x + 1;
260
+ }
261
+ base = x;
262
+ break;
263
+ }
264
+ text = NULL;
265
+ edge = NULL;
266
+ len = size = 0;
267
+ spaceAfter = gFalse;
268
+ next = NULL;
269
+
270
+ #if TEXTOUT_WORD_LIST
271
+ GfxRGB rgb;
272
+
273
+ if ((state->getRender() & 3) == 1) {
274
+ state->getStrokeRGB(&rgb);
275
+ } else {
276
+ state->getFillRGB(&rgb);
277
+ }
278
+ colorR = colToDbl(rgb.r);
279
+ colorG = colToDbl(rgb.g);
280
+ colorB = colToDbl(rgb.b);
281
+ #endif
282
+
283
+ underlined = gFalse;
284
+ link = NULL;
285
+ }
286
+
287
+ TextWord::~TextWord() {
288
+ gfree(text);
289
+ gfree(edge);
290
+ }
291
+
292
+ void TextWord::addChar(GfxState *state, double x, double y,
293
+ double dx, double dy, Unicode u) {
294
+ if (len == size) {
295
+ size += 16;
296
+ text = (Unicode *)greallocn(text, size, sizeof(Unicode));
297
+ edge = (double *)greallocn(edge, size + 1, sizeof(double));
298
+ }
299
+ text[len] = u;
300
+ switch (rot) {
301
+ case 0:
302
+ if (len == 0) {
303
+ xMin = x;
304
+ }
305
+ edge[len] = x;
306
+ xMax = edge[len+1] = x + dx;
307
+ break;
308
+ case 1:
309
+ if (len == 0) {
310
+ yMin = y;
311
+ }
312
+ edge[len] = y;
313
+ yMax = edge[len+1] = y + dy;
314
+ break;
315
+ case 2:
316
+ if (len == 0) {
317
+ xMax = x;
318
+ }
319
+ edge[len] = x;
320
+ xMin = edge[len+1] = x + dx;
321
+ break;
322
+ case 3:
323
+ if (len == 0) {
324
+ yMax = y;
325
+ }
326
+ edge[len] = y;
327
+ yMin = edge[len+1] = y + dy;
328
+ break;
329
+ }
330
+ ++len;
331
+ }
332
+
333
+ void TextWord::merge(TextWord *word) {
334
+ int i;
335
+
336
+ if (word->xMin < xMin) {
337
+ xMin = word->xMin;
338
+ }
339
+ if (word->yMin < yMin) {
340
+ yMin = word->yMin;
341
+ }
342
+ if (word->xMax > xMax) {
343
+ xMax = word->xMax;
344
+ }
345
+ if (word->yMax > yMax) {
346
+ yMax = word->yMax;
347
+ }
348
+ if (len + word->len > size) {
349
+ size = len + word->len;
350
+ text = (Unicode *)greallocn(text, size, sizeof(Unicode));
351
+ edge = (double *)greallocn(edge, size + 1, sizeof(double));
352
+ }
353
+ for (i = 0; i < word->len; ++i) {
354
+ text[len + i] = word->text[i];
355
+ edge[len + i] = word->edge[i];
356
+ }
357
+ edge[len + word->len] = word->edge[word->len];
358
+ len += word->len;
359
+ charLen += word->charLen;
360
+ }
361
+
362
+ inline int TextWord::primaryCmp(TextWord *word) {
363
+ double cmp;
364
+
365
+ cmp = 0; // make gcc happy
366
+ switch (rot) {
367
+ case 0:
368
+ cmp = xMin - word->xMin;
369
+ break;
370
+ case 1:
371
+ cmp = yMin - word->yMin;
372
+ break;
373
+ case 2:
374
+ cmp = word->xMax - xMax;
375
+ break;
376
+ case 3:
377
+ cmp = word->yMax - yMax;
378
+ break;
379
+ }
380
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
381
+ }
382
+
383
+ double TextWord::primaryDelta(TextWord *word) {
384
+ double delta;
385
+
386
+ delta = 0; // make gcc happy
387
+ switch (rot) {
388
+ case 0:
389
+ delta = word->xMin - xMax;
390
+ break;
391
+ case 1:
392
+ delta = word->yMin - yMax;
393
+ break;
394
+ case 2:
395
+ delta = xMin - word->xMax;
396
+ break;
397
+ case 3:
398
+ delta = yMin - word->yMax;
399
+ break;
400
+ }
401
+ return delta;
402
+ }
403
+
404
+ int TextWord::cmpYX(const void *p1, const void *p2) {
405
+ TextWord *word1 = *(TextWord **)p1;
406
+ TextWord *word2 = *(TextWord **)p2;
407
+ double cmp;
408
+
409
+ cmp = word1->yMin - word2->yMin;
410
+ if (cmp == 0) {
411
+ cmp = word1->xMin - word2->xMin;
412
+ }
413
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
414
+ }
415
+
416
+ #if TEXTOUT_WORD_LIST
417
+
418
+ GString *TextWord::getText() {
419
+ GString *s;
420
+ UnicodeMap *uMap;
421
+ char buf[8];
422
+ int n, i;
423
+
424
+ s = new GString();
425
+ if (!(uMap = globalParams->getTextEncoding())) {
426
+ return s;
427
+ }
428
+ for (i = 0; i < len; ++i) {
429
+ n = uMap->mapUnicode(text[i], buf, sizeof(buf));
430
+ s->append(buf, n);
431
+ }
432
+ uMap->decRefCnt();
433
+ return s;
434
+ }
435
+
436
+ void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
437
+ double *xMaxA, double *yMaxA) {
438
+ if (charIdx < 0 || charIdx >= len) {
439
+ return;
440
+ }
441
+ switch (rot) {
442
+ case 0:
443
+ *xMinA = edge[charIdx];
444
+ *xMaxA = edge[charIdx + 1];
445
+ *yMinA = yMin;
446
+ *yMaxA = yMax;
447
+ break;
448
+ case 1:
449
+ *xMinA = xMin;
450
+ *xMaxA = xMax;
451
+ *yMinA = edge[charIdx];
452
+ *yMaxA = edge[charIdx + 1];
453
+ break;
454
+ case 2:
455
+ *xMinA = edge[charIdx + 1];
456
+ *xMaxA = edge[charIdx];
457
+ *yMinA = yMin;
458
+ *yMaxA = yMax;
459
+ break;
460
+ case 3:
461
+ *xMinA = xMin;
462
+ *xMaxA = xMax;
463
+ *yMinA = edge[charIdx + 1];
464
+ *yMaxA = edge[charIdx];
465
+ break;
466
+ }
467
+ }
468
+
469
+ #endif // TEXTOUT_WORD_LIST
470
+
471
+ //------------------------------------------------------------------------
472
+ // TextPool
473
+ //------------------------------------------------------------------------
474
+
475
+ TextPool::TextPool() {
476
+ minBaseIdx = 0;
477
+ maxBaseIdx = -1;
478
+ pool = NULL;
479
+ cursor = NULL;
480
+ cursorBaseIdx = -1;
481
+ }
482
+
483
+ TextPool::~TextPool() {
484
+ int baseIdx;
485
+ TextWord *word, *word2;
486
+
487
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
488
+ for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
489
+ word2 = word->next;
490
+ delete word;
491
+ }
492
+ }
493
+ gfree(pool);
494
+ }
495
+
496
+ int TextPool::getBaseIdx(double base) {
497
+ int baseIdx;
498
+
499
+ baseIdx = (int)(base / textPoolStep);
500
+ if (baseIdx < minBaseIdx) {
501
+ return minBaseIdx;
502
+ }
503
+ if (baseIdx > maxBaseIdx) {
504
+ return maxBaseIdx;
505
+ }
506
+ return baseIdx;
507
+ }
508
+
509
+ void TextPool::addWord(TextWord *word) {
510
+ TextWord **newPool;
511
+ int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
512
+ TextWord *w0, *w1;
513
+
514
+ // expand the array if needed
515
+ wordBaseIdx = (int)(word->base / textPoolStep);
516
+ if (minBaseIdx > maxBaseIdx) {
517
+ minBaseIdx = wordBaseIdx - 128;
518
+ maxBaseIdx = wordBaseIdx + 128;
519
+ pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
520
+ sizeof(TextWord *));
521
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
522
+ pool[baseIdx - minBaseIdx] = NULL;
523
+ }
524
+ } else if (wordBaseIdx < minBaseIdx) {
525
+ newMinBaseIdx = wordBaseIdx - 128;
526
+ newPool = (TextWord **)gmallocn(maxBaseIdx - newMinBaseIdx + 1,
527
+ sizeof(TextWord *));
528
+ for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
529
+ newPool[baseIdx - newMinBaseIdx] = NULL;
530
+ }
531
+ memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
532
+ (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
533
+ gfree(pool);
534
+ pool = newPool;
535
+ minBaseIdx = newMinBaseIdx;
536
+ } else if (wordBaseIdx > maxBaseIdx) {
537
+ newMaxBaseIdx = wordBaseIdx + 128;
538
+ pool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
539
+ sizeof(TextWord *));
540
+ for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
541
+ pool[baseIdx - minBaseIdx] = NULL;
542
+ }
543
+ maxBaseIdx = newMaxBaseIdx;
544
+ }
545
+
546
+ // insert the new word
547
+ if (cursor && wordBaseIdx == cursorBaseIdx &&
548
+ word->primaryCmp(cursor) > 0) {
549
+ w0 = cursor;
550
+ w1 = cursor->next;
551
+ } else {
552
+ w0 = NULL;
553
+ w1 = pool[wordBaseIdx - minBaseIdx];
554
+ }
555
+ for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
556
+ word->next = w1;
557
+ if (w0) {
558
+ w0->next = word;
559
+ } else {
560
+ pool[wordBaseIdx - minBaseIdx] = word;
561
+ }
562
+ cursor = word;
563
+ cursorBaseIdx = wordBaseIdx;
564
+ }
565
+
566
+ //------------------------------------------------------------------------
567
+ // TextLine
568
+ //------------------------------------------------------------------------
569
+
570
+ TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
571
+ blk = blkA;
572
+ rot = rotA;
573
+ xMin = yMin = 0;
574
+ xMax = yMax = -1;
575
+ base = baseA;
576
+ words = lastWord = NULL;
577
+ text = NULL;
578
+ edge = NULL;
579
+ col = NULL;
580
+ len = 0;
581
+ convertedLen = 0;
582
+ hyphenated = gFalse;
583
+ next = NULL;
584
+ }
585
+
586
+ TextLine::~TextLine() {
587
+ TextWord *word;
588
+
589
+ while (words) {
590
+ word = words;
591
+ words = words->next;
592
+ delete word;
593
+ }
594
+ gfree(text);
595
+ gfree(edge);
596
+ gfree(col);
597
+ }
598
+
599
+ void TextLine::addWord(TextWord *word) {
600
+ if (lastWord) {
601
+ lastWord->next = word;
602
+ } else {
603
+ words = word;
604
+ }
605
+ lastWord = word;
606
+
607
+ if (xMin > xMax) {
608
+ xMin = word->xMin;
609
+ xMax = word->xMax;
610
+ yMin = word->yMin;
611
+ yMax = word->yMax;
612
+ } else {
613
+ if (word->xMin < xMin) {
614
+ xMin = word->xMin;
615
+ }
616
+ if (word->xMax > xMax) {
617
+ xMax = word->xMax;
618
+ }
619
+ if (word->yMin < yMin) {
620
+ yMin = word->yMin;
621
+ }
622
+ if (word->yMax > yMax) {
623
+ yMax = word->yMax;
624
+ }
625
+ }
626
+ }
627
+
628
+ double TextLine::primaryDelta(TextLine *line) {
629
+ double delta;
630
+
631
+ delta = 0; // make gcc happy
632
+ switch (rot) {
633
+ case 0:
634
+ delta = line->xMin - xMax;
635
+ break;
636
+ case 1:
637
+ delta = line->yMin - yMax;
638
+ break;
639
+ case 2:
640
+ delta = xMin - line->xMax;
641
+ break;
642
+ case 3:
643
+ delta = yMin - line->yMax;
644
+ break;
645
+ }
646
+ return delta;
647
+ }
648
+
649
+ int TextLine::primaryCmp(TextLine *line) {
650
+ double cmp;
651
+
652
+ cmp = 0; // make gcc happy
653
+ switch (rot) {
654
+ case 0:
655
+ cmp = xMin - line->xMin;
656
+ break;
657
+ case 1:
658
+ cmp = yMin - line->yMin;
659
+ break;
660
+ case 2:
661
+ cmp = line->xMax - xMax;
662
+ break;
663
+ case 3:
664
+ cmp = line->yMax - yMax;
665
+ break;
666
+ }
667
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
668
+ }
669
+
670
+ int TextLine::secondaryCmp(TextLine *line) {
671
+ double cmp;
672
+
673
+ cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
674
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
675
+ }
676
+
677
+ int TextLine::cmpYX(TextLine *line) {
678
+ int cmp;
679
+
680
+ if ((cmp = secondaryCmp(line))) {
681
+ return cmp;
682
+ }
683
+ return primaryCmp(line);
684
+ }
685
+
686
+ int TextLine::cmpXY(const void *p1, const void *p2) {
687
+ TextLine *line1 = *(TextLine **)p1;
688
+ TextLine *line2 = *(TextLine **)p2;
689
+ int cmp;
690
+
691
+ if ((cmp = line1->primaryCmp(line2))) {
692
+ return cmp;
693
+ }
694
+ return line1->secondaryCmp(line2);
695
+ }
696
+
697
+ void TextLine::coalesce(UnicodeMap *uMap) {
698
+ TextWord *word0, *word1;
699
+ double space, delta, minSpace;
700
+ GBool isUnicode;
701
+ char buf[8];
702
+ int i, j;
703
+
704
+ if (words->next) {
705
+
706
+ // compute the inter-word space threshold
707
+ if (words->len > 1 || words->next->len > 1) {
708
+ minSpace = 0;
709
+ } else {
710
+ minSpace = words->primaryDelta(words->next);
711
+ for (word0 = words->next, word1 = word0->next;
712
+ word1 && minSpace > 0;
713
+ word0 = word1, word1 = word0->next) {
714
+ if (word1->len > 1) {
715
+ minSpace = 0;
716
+ }
717
+ delta = word0->primaryDelta(word1);
718
+ if (delta < minSpace) {
719
+ minSpace = delta;
720
+ }
721
+ }
722
+ }
723
+ if (minSpace <= 0) {
724
+ space = maxCharSpacing * words->fontSize;
725
+ } else {
726
+ space = maxWideCharSpacingMul * minSpace;
727
+ if (space > maxWideCharSpacing * words->fontSize) {
728
+ space = maxWideCharSpacing * words->fontSize;
729
+ }
730
+ }
731
+
732
+ // merge words
733
+ word0 = words;
734
+ word1 = words->next;
735
+ while (word1) {
736
+ if (word0->primaryDelta(word1) >= space) {
737
+ word0->spaceAfter = gTrue;
738
+ word0 = word1;
739
+ word1 = word1->next;
740
+ } else if (word0->font == word1->font &&
741
+ word0->underlined == word1->underlined &&
742
+ fabs(word0->fontSize - word1->fontSize) <
743
+ maxWordFontSizeDelta * words->fontSize &&
744
+ word1->charPos == word0->charPos + word0->charLen) {
745
+ word0->merge(word1);
746
+ word0->next = word1->next;
747
+ delete word1;
748
+ word1 = word0->next;
749
+ } else {
750
+ word0 = word1;
751
+ word1 = word1->next;
752
+ }
753
+ }
754
+ }
755
+
756
+ // build the line text
757
+ isUnicode = uMap ? uMap->isUnicode() : gFalse;
758
+ len = 0;
759
+ for (word1 = words; word1; word1 = word1->next) {
760
+ len += word1->len;
761
+ if (word1->spaceAfter) {
762
+ ++len;
763
+ }
764
+ }
765
+ text = (Unicode *)gmallocn(len, sizeof(Unicode));
766
+ edge = (double *)gmallocn(len + 1, sizeof(double));
767
+ i = 0;
768
+ for (word1 = words; word1; word1 = word1->next) {
769
+ for (j = 0; j < word1->len; ++j) {
770
+ text[i] = word1->text[j];
771
+ edge[i] = word1->edge[j];
772
+ ++i;
773
+ }
774
+ edge[i] = word1->edge[word1->len];
775
+ if (word1->spaceAfter) {
776
+ text[i] = (Unicode)0x0020;
777
+ ++i;
778
+ }
779
+ }
780
+
781
+ // compute convertedLen and set up the col array
782
+ col = (int *)gmallocn(len + 1, sizeof(int));
783
+ convertedLen = 0;
784
+ for (i = 0; i < len; ++i) {
785
+ col[i] = convertedLen;
786
+ if (isUnicode) {
787
+ ++convertedLen;
788
+ } else if (uMap) {
789
+ convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
790
+ }
791
+ }
792
+ col[len] = convertedLen;
793
+
794
+ // check for hyphen at end of line
795
+ //~ need to check for other chars used as hyphens
796
+ hyphenated = text[len - 1] == (Unicode)'-';
797
+ }
798
+
799
+ //------------------------------------------------------------------------
800
+ // TextLineFrag
801
+ //------------------------------------------------------------------------
802
+
803
+ class TextLineFrag {
804
+ public:
805
+
806
+ TextLine *line; // the line object
807
+ int start, len; // offset and length of this fragment
808
+ // (in Unicode chars)
809
+ double xMin, xMax; // bounding box coordinates
810
+ double yMin, yMax;
811
+ double base; // baseline virtual coordinate
812
+ int col; // first column
813
+
814
+ void init(TextLine *lineA, int startA, int lenA);
815
+ void computeCoords(GBool oneRot);
816
+
817
+ static int cmpYXPrimaryRot(const void *p1, const void *p2);
818
+ static int cmpYXLineRot(const void *p1, const void *p2);
819
+ static int cmpXYLineRot(const void *p1, const void *p2);
820
+ static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
821
+ static int cmpXYColumnLineRot(const void *p1, const void *p2);
822
+ };
823
+
824
+ void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
825
+ line = lineA;
826
+ start = startA;
827
+ len = lenA;
828
+ col = line->col[start];
829
+ }
830
+
831
+ void TextLineFrag::computeCoords(GBool oneRot) {
832
+ TextBlock *blk;
833
+ double d0, d1, d2, d3, d4;
834
+
835
+ if (oneRot) {
836
+
837
+ switch (line->rot) {
838
+ case 0:
839
+ xMin = line->edge[start];
840
+ xMax = line->edge[start + len];
841
+ yMin = line->yMin;
842
+ yMax = line->yMax;
843
+ break;
844
+ case 1:
845
+ xMin = line->xMin;
846
+ xMax = line->xMax;
847
+ yMin = line->edge[start];
848
+ yMax = line->edge[start + len];
849
+ break;
850
+ case 2:
851
+ xMin = line->edge[start + len];
852
+ xMax = line->edge[start];
853
+ yMin = line->yMin;
854
+ yMax = line->yMax;
855
+ break;
856
+ case 3:
857
+ xMin = line->xMin;
858
+ xMax = line->xMax;
859
+ yMin = line->edge[start + len];
860
+ yMax = line->edge[start];
861
+ break;
862
+ }
863
+ base = line->base;
864
+
865
+ } else {
866
+
867
+ if (line->rot == 0 && line->blk->page->primaryRot == 0) {
868
+
869
+ xMin = line->edge[start];
870
+ xMax = line->edge[start + len];
871
+ yMin = line->yMin;
872
+ yMax = line->yMax;
873
+ base = line->base;
874
+
875
+ } else {
876
+
877
+ blk = line->blk;
878
+ d0 = line->edge[start];
879
+ d1 = line->edge[start + len];
880
+ d2 = d3 = d4 = 0; // make gcc happy
881
+
882
+ switch (line->rot) {
883
+ case 0:
884
+ d2 = line->yMin;
885
+ d3 = line->yMax;
886
+ d4 = line->base;
887
+ d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
888
+ d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
889
+ d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
890
+ d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
891
+ d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
892
+ break;
893
+ case 1:
894
+ d2 = line->xMax;
895
+ d3 = line->xMin;
896
+ d4 = line->base;
897
+ d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
898
+ d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
899
+ d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
900
+ d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
901
+ d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
902
+ break;
903
+ case 2:
904
+ d2 = line->yMax;
905
+ d3 = line->yMin;
906
+ d4 = line->base;
907
+ d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
908
+ d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
909
+ d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
910
+ d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
911
+ d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
912
+ break;
913
+ case 3:
914
+ d2 = line->xMin;
915
+ d3 = line->xMax;
916
+ d4 = line->base;
917
+ d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
918
+ d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
919
+ d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
920
+ d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
921
+ d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
922
+ break;
923
+ }
924
+
925
+ switch (line->blk->page->primaryRot) {
926
+ case 0:
927
+ xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
928
+ xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
929
+ yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
930
+ yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
931
+ base = blk->yMin + base * (blk->yMax - blk->yMin);
932
+ break;
933
+ case 1:
934
+ xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
935
+ xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
936
+ yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
937
+ yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
938
+ base = blk->xMax - d4 * (blk->xMax - blk->xMin);
939
+ break;
940
+ case 2:
941
+ xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
942
+ xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
943
+ yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
944
+ yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
945
+ base = blk->yMax - d4 * (blk->yMax - blk->yMin);
946
+ break;
947
+ case 3:
948
+ xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
949
+ xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
950
+ yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
951
+ yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
952
+ base = blk->xMin + d4 * (blk->xMax - blk->xMin);
953
+ break;
954
+ }
955
+
956
+ }
957
+ }
958
+ }
959
+
960
+ int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
961
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
962
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
963
+ double cmp;
964
+
965
+ cmp = 0; // make gcc happy
966
+ switch (frag1->line->blk->page->primaryRot) {
967
+ case 0:
968
+ if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
969
+ cmp = frag1->xMin - frag2->xMin;
970
+ }
971
+ break;
972
+ case 1:
973
+ if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
974
+ cmp = frag1->yMin - frag2->yMin;
975
+ }
976
+ break;
977
+ case 2:
978
+ if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
979
+ cmp = frag2->xMax - frag1->xMax;
980
+ }
981
+ break;
982
+ case 3:
983
+ if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
984
+ cmp = frag2->yMax - frag1->yMax;
985
+ }
986
+ break;
987
+ }
988
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
989
+ }
990
+
991
+ int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
992
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
993
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
994
+ double cmp;
995
+
996
+ cmp = 0; // make gcc happy
997
+ switch (frag1->line->rot) {
998
+ case 0:
999
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1000
+ cmp = frag1->xMin - frag2->xMin;
1001
+ }
1002
+ break;
1003
+ case 1:
1004
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1005
+ cmp = frag1->yMin - frag2->yMin;
1006
+ }
1007
+ break;
1008
+ case 2:
1009
+ if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1010
+ cmp = frag2->xMax - frag1->xMax;
1011
+ }
1012
+ break;
1013
+ case 3:
1014
+ if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1015
+ cmp = frag2->yMax - frag1->yMax;
1016
+ }
1017
+ break;
1018
+ }
1019
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1020
+ }
1021
+
1022
+ int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
1023
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
1024
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
1025
+ double cmp;
1026
+
1027
+ cmp = 0; // make gcc happy
1028
+ switch (frag1->line->rot) {
1029
+ case 0:
1030
+ if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1031
+ cmp = frag1->yMin - frag2->yMin;
1032
+ }
1033
+ break;
1034
+ case 1:
1035
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1036
+ cmp = frag2->xMax - frag1->xMax;
1037
+ }
1038
+ break;
1039
+ case 2:
1040
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1041
+ cmp = frag2->yMin - frag1->yMin;
1042
+ }
1043
+ break;
1044
+ case 3:
1045
+ if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1046
+ cmp = frag1->xMax - frag2->xMax;
1047
+ }
1048
+ break;
1049
+ }
1050
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1051
+ }
1052
+
1053
+ int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) {
1054
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
1055
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
1056
+ double cmp;
1057
+
1058
+ // if columns overlap, compare y values
1059
+ if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1060
+ frag2->line->col[frag2->start]) &&
1061
+ frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1062
+ frag1->line->col[frag1->start])) {
1063
+ cmp = 0; // make gcc happy
1064
+ switch (frag1->line->blk->page->primaryRot) {
1065
+ case 0: cmp = frag1->yMin - frag2->yMin; break;
1066
+ case 1: cmp = frag2->xMax - frag1->xMax; break;
1067
+ case 2: cmp = frag2->yMin - frag1->yMin; break;
1068
+ case 3: cmp = frag1->xMax - frag2->xMax; break;
1069
+ }
1070
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1071
+ }
1072
+
1073
+ // otherwise, compare starting column
1074
+ return frag1->col - frag2->col;
1075
+ }
1076
+
1077
+ int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) {
1078
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
1079
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
1080
+ double cmp;
1081
+
1082
+ // if columns overlap, compare y values
1083
+ if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1084
+ frag2->line->col[frag2->start]) &&
1085
+ frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1086
+ frag1->line->col[frag1->start])) {
1087
+ cmp = 0; // make gcc happy
1088
+ switch (frag1->line->rot) {
1089
+ case 0: cmp = frag1->yMin - frag2->yMin; break;
1090
+ case 1: cmp = frag2->xMax - frag1->xMax; break;
1091
+ case 2: cmp = frag2->yMin - frag1->yMin; break;
1092
+ case 3: cmp = frag1->xMax - frag2->xMax; break;
1093
+ }
1094
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1095
+ }
1096
+
1097
+ // otherwise, compare starting column
1098
+ return frag1->col - frag2->col;
1099
+ }
1100
+
1101
+ //------------------------------------------------------------------------
1102
+ // TextBlock
1103
+ //------------------------------------------------------------------------
1104
+
1105
+ TextBlock::TextBlock(TextPage *pageA, int rotA) {
1106
+ page = pageA;
1107
+ rot = rotA;
1108
+ xMin = yMin = 0;
1109
+ xMax = yMax = -1;
1110
+ priMin = 0;
1111
+ priMax = page->pageWidth;
1112
+ pool = new TextPool();
1113
+ lines = NULL;
1114
+ curLine = NULL;
1115
+ next = NULL;
1116
+ stackNext = NULL;
1117
+ }
1118
+
1119
+ TextBlock::~TextBlock() {
1120
+ TextLine *line;
1121
+
1122
+ delete pool;
1123
+ while (lines) {
1124
+ line = lines;
1125
+ lines = lines->next;
1126
+ delete line;
1127
+ }
1128
+ }
1129
+
1130
+ void TextBlock::addWord(TextWord *word) {
1131
+ pool->addWord(word);
1132
+ if (xMin > xMax) {
1133
+ xMin = word->xMin;
1134
+ xMax = word->xMax;
1135
+ yMin = word->yMin;
1136
+ yMax = word->yMax;
1137
+ } else {
1138
+ if (word->xMin < xMin) {
1139
+ xMin = word->xMin;
1140
+ }
1141
+ if (word->xMax > xMax) {
1142
+ xMax = word->xMax;
1143
+ }
1144
+ if (word->yMin < yMin) {
1145
+ yMin = word->yMin;
1146
+ }
1147
+ if (word->yMax > yMax) {
1148
+ yMax = word->yMax;
1149
+ }
1150
+ }
1151
+ }
1152
+
1153
+ void TextBlock::coalesce(UnicodeMap *uMap) {
1154
+ TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1155
+ TextLine *line, *line0, *line1;
1156
+ int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1157
+ int baseIdx, bestWordBaseIdx, idx0, idx1;
1158
+ double minBase, maxBase;
1159
+ double fontSize, delta, priDelta, secDelta;
1160
+ TextLine **lineArray;
1161
+ GBool found;
1162
+ int col1, col2;
1163
+ int i, j, k;
1164
+
1165
+ // discard duplicated text (fake boldface, drop shadows)
1166
+ for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1167
+ word0 = pool->getPool(idx0);
1168
+ while (word0) {
1169
+ priDelta = dupMaxPriDelta * word0->fontSize;
1170
+ secDelta = dupMaxSecDelta * word0->fontSize;
1171
+ if (rot == 0 || rot == 3) {
1172
+ maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1173
+ } else {
1174
+ maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1175
+ }
1176
+ found = gFalse;
1177
+ word1 = word2 = NULL; // make gcc happy
1178
+ for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1179
+ if (idx1 == idx0) {
1180
+ word1 = word0;
1181
+ word2 = word0->next;
1182
+ } else {
1183
+ word1 = NULL;
1184
+ word2 = pool->getPool(idx1);
1185
+ }
1186
+ for (; word2; word1 = word2, word2 = word2->next) {
1187
+ if (word2->len == word0->len &&
1188
+ !memcmp(word2->text, word0->text,
1189
+ word0->len * sizeof(Unicode))) {
1190
+ switch (rot) {
1191
+ case 0:
1192
+ case 2:
1193
+ found = fabs(word0->xMin - word2->xMin) < priDelta &&
1194
+ fabs(word0->xMax - word2->xMax) < priDelta &&
1195
+ fabs(word0->yMin - word2->yMin) < secDelta &&
1196
+ fabs(word0->yMax - word2->yMax) < secDelta;
1197
+ break;
1198
+ case 1:
1199
+ case 3:
1200
+ found = fabs(word0->xMin - word2->xMin) < secDelta &&
1201
+ fabs(word0->xMax - word2->xMax) < secDelta &&
1202
+ fabs(word0->yMin - word2->yMin) < priDelta &&
1203
+ fabs(word0->yMax - word2->yMax) < priDelta;
1204
+ break;
1205
+ }
1206
+ }
1207
+ if (found) {
1208
+ break;
1209
+ }
1210
+ }
1211
+ if (found) {
1212
+ break;
1213
+ }
1214
+ }
1215
+ if (found) {
1216
+ if (word1) {
1217
+ word1->next = word2->next;
1218
+ } else {
1219
+ pool->setPool(idx1, word2->next);
1220
+ }
1221
+ delete word2;
1222
+ } else {
1223
+ word0 = word0->next;
1224
+ }
1225
+ }
1226
+ }
1227
+
1228
+ // build the lines
1229
+ curLine = NULL;
1230
+ poolMinBaseIdx = pool->minBaseIdx;
1231
+ charCount = 0;
1232
+ nLines = 0;
1233
+ while (1) {
1234
+
1235
+ // find the first non-empty line in the pool
1236
+ for (;
1237
+ poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1238
+ ++poolMinBaseIdx) ;
1239
+ if (poolMinBaseIdx > pool->maxBaseIdx) {
1240
+ break;
1241
+ }
1242
+
1243
+ // look for the left-most word in the first four lines of the
1244
+ // pool -- this avoids starting with a superscript word
1245
+ startBaseIdx = poolMinBaseIdx;
1246
+ for (baseIdx = poolMinBaseIdx + 1;
1247
+ baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1248
+ ++baseIdx) {
1249
+ if (!pool->getPool(baseIdx)) {
1250
+ continue;
1251
+ }
1252
+ if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1253
+ < 0) {
1254
+ startBaseIdx = baseIdx;
1255
+ }
1256
+ }
1257
+
1258
+ // create a new line
1259
+ word0 = pool->getPool(startBaseIdx);
1260
+ pool->setPool(startBaseIdx, word0->next);
1261
+ word0->next = NULL;
1262
+ line = new TextLine(this, word0->rot, word0->base);
1263
+ line->addWord(word0);
1264
+ lastWord = word0;
1265
+
1266
+ // compute the search range
1267
+ fontSize = word0->fontSize;
1268
+ minBase = word0->base - maxIntraLineDelta * fontSize;
1269
+ maxBase = word0->base + maxIntraLineDelta * fontSize;
1270
+ minBaseIdx = pool->getBaseIdx(minBase);
1271
+ maxBaseIdx = pool->getBaseIdx(maxBase);
1272
+
1273
+ // find the rest of the words in this line
1274
+ while (1) {
1275
+
1276
+ // find the left-most word whose baseline is in the range for
1277
+ // this line
1278
+ bestWordBaseIdx = 0;
1279
+ bestWord0 = bestWord1 = NULL;
1280
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1281
+ for (word0 = NULL, word1 = pool->getPool(baseIdx);
1282
+ word1;
1283
+ word0 = word1, word1 = word1->next) {
1284
+ if (word1->base >= minBase &&
1285
+ word1->base <= maxBase &&
1286
+ (delta = lastWord->primaryDelta(word1)) >=
1287
+ minCharSpacing * fontSize) {
1288
+ if (delta < maxWordSpacing * fontSize &&
1289
+ (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1290
+ bestWordBaseIdx = baseIdx;
1291
+ bestWord0 = word0;
1292
+ bestWord1 = word1;
1293
+ }
1294
+ break;
1295
+ }
1296
+ }
1297
+ }
1298
+ if (!bestWord1) {
1299
+ break;
1300
+ }
1301
+
1302
+ // remove it from the pool, and add it to the line
1303
+ if (bestWord0) {
1304
+ bestWord0->next = bestWord1->next;
1305
+ } else {
1306
+ pool->setPool(bestWordBaseIdx, bestWord1->next);
1307
+ }
1308
+ bestWord1->next = NULL;
1309
+ line->addWord(bestWord1);
1310
+ lastWord = bestWord1;
1311
+ }
1312
+
1313
+ // add the line
1314
+ if (curLine && line->cmpYX(curLine) > 0) {
1315
+ line0 = curLine;
1316
+ line1 = curLine->next;
1317
+ } else {
1318
+ line0 = NULL;
1319
+ line1 = lines;
1320
+ }
1321
+ for (;
1322
+ line1 && line->cmpYX(line1) > 0;
1323
+ line0 = line1, line1 = line1->next) ;
1324
+ if (line0) {
1325
+ line0->next = line;
1326
+ } else {
1327
+ lines = line;
1328
+ }
1329
+ line->next = line1;
1330
+ curLine = line;
1331
+ line->coalesce(uMap);
1332
+ charCount += line->len;
1333
+ ++nLines;
1334
+ }
1335
+
1336
+ // sort lines into xy order for column assignment
1337
+ lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1338
+ for (line = lines, i = 0; line; line = line->next, ++i) {
1339
+ lineArray[i] = line;
1340
+ }
1341
+ qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1342
+
1343
+ // column assignment
1344
+ nColumns = 0;
1345
+ for (i = 0; i < nLines; ++i) {
1346
+ line0 = lineArray[i];
1347
+ col1 = 0;
1348
+ for (j = 0; j < i; ++j) {
1349
+ line1 = lineArray[j];
1350
+ if (line1->primaryDelta(line0) >= 0) {
1351
+ col2 = line1->col[line1->len] + 1;
1352
+ } else {
1353
+ k = 0; // make gcc happy
1354
+ switch (rot) {
1355
+ case 0:
1356
+ for (k = 0;
1357
+ k < line1->len &&
1358
+ line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1359
+ ++k) ;
1360
+ break;
1361
+ case 1:
1362
+ for (k = 0;
1363
+ k < line1->len &&
1364
+ line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1365
+ ++k) ;
1366
+ break;
1367
+ case 2:
1368
+ for (k = 0;
1369
+ k < line1->len &&
1370
+ line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1371
+ ++k) ;
1372
+ break;
1373
+ case 3:
1374
+ for (k = 0;
1375
+ k < line1->len &&
1376
+ line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1377
+ ++k) ;
1378
+ break;
1379
+ }
1380
+ col2 = line1->col[k];
1381
+ }
1382
+ if (col2 > col1) {
1383
+ col1 = col2;
1384
+ }
1385
+ }
1386
+ for (k = 0; k <= line0->len; ++k) {
1387
+ line0->col[k] += col1;
1388
+ }
1389
+ if (line0->col[line0->len] > nColumns) {
1390
+ nColumns = line0->col[line0->len];
1391
+ }
1392
+ }
1393
+ gfree(lineArray);
1394
+ }
1395
+
1396
+ void TextBlock::updatePriMinMax(TextBlock *blk) {
1397
+ double newPriMin, newPriMax;
1398
+ GBool gotPriMin, gotPriMax;
1399
+
1400
+ gotPriMin = gotPriMax = gFalse;
1401
+ newPriMin = newPriMax = 0; // make gcc happy
1402
+ switch (page->primaryRot) {
1403
+ case 0:
1404
+ case 2:
1405
+ if (blk->yMin < yMax && blk->yMax > yMin) {
1406
+ if (blk->xMin < xMin) {
1407
+ newPriMin = blk->xMax;
1408
+ gotPriMin = gTrue;
1409
+ }
1410
+ if (blk->xMax > xMax) {
1411
+ newPriMax = blk->xMin;
1412
+ gotPriMax = gTrue;
1413
+ }
1414
+ }
1415
+ break;
1416
+ case 1:
1417
+ case 3:
1418
+ if (blk->xMin < xMax && blk->xMax > xMin) {
1419
+ if (blk->yMin < yMin) {
1420
+ newPriMin = blk->yMax;
1421
+ gotPriMin = gTrue;
1422
+ }
1423
+ if (blk->yMax > yMax) {
1424
+ newPriMax = blk->yMin;
1425
+ gotPriMax = gTrue;
1426
+ }
1427
+ }
1428
+ break;
1429
+ }
1430
+ if (gotPriMin) {
1431
+ if (newPriMin > xMin) {
1432
+ newPriMin = xMin;
1433
+ }
1434
+ if (newPriMin > priMin) {
1435
+ priMin = newPriMin;
1436
+ }
1437
+ }
1438
+ if (gotPriMax) {
1439
+ if (newPriMax < xMax) {
1440
+ newPriMax = xMax;
1441
+ }
1442
+ if (newPriMax < priMax) {
1443
+ priMax = newPriMax;
1444
+ }
1445
+ }
1446
+ }
1447
+
1448
+ int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1449
+ TextBlock *blk1 = *(TextBlock **)p1;
1450
+ TextBlock *blk2 = *(TextBlock **)p2;
1451
+ double cmp;
1452
+
1453
+ cmp = 0; // make gcc happy
1454
+ switch (blk1->page->primaryRot) {
1455
+ case 0:
1456
+ if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1457
+ cmp = blk1->yMin - blk2->yMin;
1458
+ }
1459
+ break;
1460
+ case 1:
1461
+ if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1462
+ cmp = blk2->xMax - blk1->xMax;
1463
+ }
1464
+ break;
1465
+ case 2:
1466
+ if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1467
+ cmp = blk2->yMin - blk1->yMin;
1468
+ }
1469
+ break;
1470
+ case 3:
1471
+ if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1472
+ cmp = blk1->xMax - blk2->xMax;
1473
+ }
1474
+ break;
1475
+ }
1476
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1477
+ }
1478
+
1479
+ int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1480
+ TextBlock *blk1 = *(TextBlock **)p1;
1481
+ TextBlock *blk2 = *(TextBlock **)p2;
1482
+ double cmp;
1483
+
1484
+ cmp = 0; // make gcc happy
1485
+ switch (blk1->page->primaryRot) {
1486
+ case 0:
1487
+ if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1488
+ cmp = blk1->xMin - blk2->xMin;
1489
+ }
1490
+ break;
1491
+ case 1:
1492
+ if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1493
+ cmp = blk1->yMin - blk2->yMin;
1494
+ }
1495
+ break;
1496
+ case 2:
1497
+ if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1498
+ cmp = blk2->xMax - blk1->xMax;
1499
+ }
1500
+ break;
1501
+ case 3:
1502
+ if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1503
+ cmp = blk2->yMax - blk1->yMax;
1504
+ }
1505
+ break;
1506
+ }
1507
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1508
+ }
1509
+
1510
+ int TextBlock::primaryCmp(TextBlock *blk) {
1511
+ double cmp;
1512
+
1513
+ cmp = 0; // make gcc happy
1514
+ switch (rot) {
1515
+ case 0:
1516
+ cmp = xMin - blk->xMin;
1517
+ break;
1518
+ case 1:
1519
+ cmp = yMin - blk->yMin;
1520
+ break;
1521
+ case 2:
1522
+ cmp = blk->xMax - xMax;
1523
+ break;
1524
+ case 3:
1525
+ cmp = blk->yMax - yMax;
1526
+ break;
1527
+ }
1528
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1529
+ }
1530
+
1531
+ double TextBlock::secondaryDelta(TextBlock *blk) {
1532
+ double delta;
1533
+
1534
+ delta = 0; // make gcc happy
1535
+ switch (rot) {
1536
+ case 0:
1537
+ delta = blk->yMin - yMax;
1538
+ break;
1539
+ case 1:
1540
+ delta = xMin - blk->xMax;
1541
+ break;
1542
+ case 2:
1543
+ delta = yMin - blk->yMax;
1544
+ break;
1545
+ case 3:
1546
+ delta = blk->xMin - xMax;
1547
+ break;
1548
+ }
1549
+ return delta;
1550
+ }
1551
+
1552
+ GBool TextBlock::isBelow(TextBlock *blk) {
1553
+ GBool below;
1554
+
1555
+ below = gFalse; // make gcc happy
1556
+ switch (page->primaryRot) {
1557
+ case 0:
1558
+ below = xMin >= blk->priMin && xMax <= blk->priMax &&
1559
+ yMin > blk->yMin;
1560
+ break;
1561
+ case 1:
1562
+ below = yMin >= blk->priMin && yMax <= blk->priMax &&
1563
+ xMax < blk->xMax;
1564
+ break;
1565
+ case 2:
1566
+ below = xMin >= blk->priMin && xMax <= blk->priMax &&
1567
+ yMax < blk->yMax;
1568
+ break;
1569
+ case 3:
1570
+ below = yMin >= blk->priMin && yMax <= blk->priMax &&
1571
+ xMin > blk->xMin;
1572
+ break;
1573
+ }
1574
+
1575
+ return below;
1576
+ }
1577
+
1578
+ //------------------------------------------------------------------------
1579
+ // TextFlow
1580
+ //------------------------------------------------------------------------
1581
+
1582
+ TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1583
+ page = pageA;
1584
+ xMin = blk->xMin;
1585
+ xMax = blk->xMax;
1586
+ yMin = blk->yMin;
1587
+ yMax = blk->yMax;
1588
+ priMin = blk->priMin;
1589
+ priMax = blk->priMax;
1590
+ blocks = lastBlk = blk;
1591
+ next = NULL;
1592
+ }
1593
+
1594
+ TextFlow::~TextFlow() {
1595
+ TextBlock *blk;
1596
+
1597
+ while (blocks) {
1598
+ blk = blocks;
1599
+ blocks = blocks->next;
1600
+ delete blk;
1601
+ }
1602
+ }
1603
+
1604
+ void TextFlow::addBlock(TextBlock *blk) {
1605
+ if (lastBlk) {
1606
+ lastBlk->next = blk;
1607
+ } else {
1608
+ blocks = blk;
1609
+ }
1610
+ lastBlk = blk;
1611
+ if (blk->xMin < xMin) {
1612
+ xMin = blk->xMin;
1613
+ }
1614
+ if (blk->xMax > xMax) {
1615
+ xMax = blk->xMax;
1616
+ }
1617
+ if (blk->yMin < yMin) {
1618
+ yMin = blk->yMin;
1619
+ }
1620
+ if (blk->yMax > yMax) {
1621
+ yMax = blk->yMax;
1622
+ }
1623
+ }
1624
+
1625
+ GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1626
+ GBool fits;
1627
+
1628
+ // lower blocks must use smaller fonts
1629
+ if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1630
+ return gFalse;
1631
+ }
1632
+
1633
+ fits = gFalse; // make gcc happy
1634
+ switch (page->primaryRot) {
1635
+ case 0:
1636
+ fits = blk->xMin >= priMin && blk->xMax <= priMax;
1637
+ break;
1638
+ case 1:
1639
+ fits = blk->yMin >= priMin && blk->yMax <= priMax;
1640
+ break;
1641
+ case 2:
1642
+ fits = blk->xMin >= priMin && blk->xMax <= priMax;
1643
+ break;
1644
+ case 3:
1645
+ fits = blk->yMin >= priMin && blk->yMax <= priMax;
1646
+ break;
1647
+ }
1648
+ return fits;
1649
+ }
1650
+
1651
+ #if TEXTOUT_WORD_LIST
1652
+
1653
+ //------------------------------------------------------------------------
1654
+ // TextWordList
1655
+ //------------------------------------------------------------------------
1656
+
1657
+ TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1658
+ TextFlow *flow;
1659
+ TextBlock *blk;
1660
+ TextLine *line;
1661
+ TextWord *word;
1662
+ TextWord **wordArray;
1663
+ int nWords, i;
1664
+
1665
+ words = new GList();
1666
+
1667
+ if (text->rawOrder) {
1668
+ for (word = text->rawWords; word; word = word->next) {
1669
+ words->append(word);
1670
+ }
1671
+
1672
+ } else if (physLayout) {
1673
+ // this is inefficient, but it's also the least useful of these
1674
+ // three cases
1675
+ nWords = 0;
1676
+ for (flow = text->flows; flow; flow = flow->next) {
1677
+ for (blk = flow->blocks; blk; blk = blk->next) {
1678
+ for (line = blk->lines; line; line = line->next) {
1679
+ for (word = line->words; word; word = word->next) {
1680
+ ++nWords;
1681
+ }
1682
+ }
1683
+ }
1684
+ }
1685
+ wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
1686
+ i = 0;
1687
+ for (flow = text->flows; flow; flow = flow->next) {
1688
+ for (blk = flow->blocks; blk; blk = blk->next) {
1689
+ for (line = blk->lines; line; line = line->next) {
1690
+ for (word = line->words; word; word = word->next) {
1691
+ wordArray[i++] = word;
1692
+ }
1693
+ }
1694
+ }
1695
+ }
1696
+ qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1697
+ for (i = 0; i < nWords; ++i) {
1698
+ words->append(wordArray[i]);
1699
+ }
1700
+ gfree(wordArray);
1701
+
1702
+ } else {
1703
+ for (flow = text->flows; flow; flow = flow->next) {
1704
+ for (blk = flow->blocks; blk; blk = blk->next) {
1705
+ for (line = blk->lines; line; line = line->next) {
1706
+ for (word = line->words; word; word = word->next) {
1707
+ words->append(word);
1708
+ }
1709
+ }
1710
+ }
1711
+ }
1712
+ }
1713
+ }
1714
+
1715
+ TextWordList::~TextWordList() {
1716
+ delete words;
1717
+ }
1718
+
1719
+ int TextWordList::getLength() {
1720
+ return words->getLength();
1721
+ }
1722
+
1723
+ TextWord *TextWordList::get(int idx) {
1724
+ if (idx < 0 || idx >= words->getLength()) {
1725
+ return NULL;
1726
+ }
1727
+ return (TextWord *)words->get(idx);
1728
+ }
1729
+
1730
+ #endif // TEXTOUT_WORD_LIST
1731
+
1732
+ //------------------------------------------------------------------------
1733
+ // TextPage
1734
+ //------------------------------------------------------------------------
1735
+
1736
+ TextPage::TextPage(GBool rawOrderA) {
1737
+ int rot;
1738
+
1739
+ rawOrder = rawOrderA;
1740
+ curWord = NULL;
1741
+ charPos = 0;
1742
+ curFont = NULL;
1743
+ curFontSize = 0;
1744
+ nest = 0;
1745
+ nTinyChars = 0;
1746
+ lastCharOverlap = gFalse;
1747
+ if (!rawOrder) {
1748
+ for (rot = 0; rot < 4; ++rot) {
1749
+ pools[rot] = new TextPool();
1750
+ }
1751
+ }
1752
+ flows = NULL;
1753
+ blocks = NULL;
1754
+ rawWords = NULL;
1755
+ rawLastWord = NULL;
1756
+ fonts = new GList();
1757
+ lastFindXMin = lastFindYMin = 0;
1758
+ haveLastFind = gFalse;
1759
+ underlines = new GList();
1760
+ links = new GList();
1761
+ }
1762
+
1763
+ TextPage::~TextPage() {
1764
+ int rot;
1765
+
1766
+ clear();
1767
+ if (!rawOrder) {
1768
+ for (rot = 0; rot < 4; ++rot) {
1769
+ delete pools[rot];
1770
+ }
1771
+ }
1772
+ delete fonts;
1773
+ deleteGList(underlines, TextUnderline);
1774
+ deleteGList(links, TextLink);
1775
+ }
1776
+
1777
+ void TextPage::startPage(GfxState *state) {
1778
+ clear();
1779
+ if (state) {
1780
+ pageWidth = state->getPageWidth();
1781
+ pageHeight = state->getPageHeight();
1782
+ } else {
1783
+ pageWidth = pageHeight = 0;
1784
+ }
1785
+ }
1786
+
1787
+ void TextPage::endPage() {
1788
+ if (curWord) {
1789
+ endWord();
1790
+ }
1791
+ }
1792
+
1793
+ void TextPage::clear() {
1794
+ int rot;
1795
+ TextFlow *flow;
1796
+ TextWord *word;
1797
+
1798
+ if (curWord) {
1799
+ delete curWord;
1800
+ curWord = NULL;
1801
+ }
1802
+ if (rawOrder) {
1803
+ while (rawWords) {
1804
+ word = rawWords;
1805
+ rawWords = rawWords->next;
1806
+ delete word;
1807
+ }
1808
+ } else {
1809
+ for (rot = 0; rot < 4; ++rot) {
1810
+ delete pools[rot];
1811
+ }
1812
+ while (flows) {
1813
+ flow = flows;
1814
+ flows = flows->next;
1815
+ delete flow;
1816
+ }
1817
+ gfree(blocks);
1818
+ }
1819
+ deleteGList(fonts, TextFontInfo);
1820
+
1821
+ curWord = NULL;
1822
+ charPos = 0;
1823
+ curFont = NULL;
1824
+ curFontSize = 0;
1825
+ nest = 0;
1826
+ nTinyChars = 0;
1827
+ if (!rawOrder) {
1828
+ for (rot = 0; rot < 4; ++rot) {
1829
+ pools[rot] = new TextPool();
1830
+ }
1831
+ }
1832
+ flows = NULL;
1833
+ blocks = NULL;
1834
+ rawWords = NULL;
1835
+ rawLastWord = NULL;
1836
+ fonts = new GList();
1837
+ }
1838
+
1839
+ void TextPage::updateFont(GfxState *state) {
1840
+ GfxFont *gfxFont;
1841
+ double *fm;
1842
+ char *name;
1843
+ int code, mCode, letterCode, anyCode;
1844
+ double w;
1845
+ int i;
1846
+
1847
+ // get the font info object
1848
+ curFont = NULL;
1849
+ for (i = 0; i < fonts->getLength(); ++i) {
1850
+ curFont = (TextFontInfo *)fonts->get(i);
1851
+ if (curFont->matches(state)) {
1852
+ break;
1853
+ }
1854
+ curFont = NULL;
1855
+ }
1856
+ if (!curFont) {
1857
+ curFont = new TextFontInfo(state);
1858
+ fonts->append(curFont);
1859
+ }
1860
+
1861
+ // adjust the font size
1862
+ gfxFont = state->getFont();
1863
+ curFontSize = state->getTransformedFontSize();
1864
+ if (gfxFont && gfxFont->getType() == fontType3) {
1865
+ // This is a hack which makes it possible to deal with some Type 3
1866
+ // fonts. The problem is that it's impossible to know what the
1867
+ // base coordinate system used in the font is without actually
1868
+ // rendering the font. This code tries to guess by looking at the
1869
+ // width of the character 'm' (which breaks if the font is a
1870
+ // subset that doesn't contain 'm').
1871
+ mCode = letterCode = anyCode = -1;
1872
+ for (code = 0; code < 256; ++code) {
1873
+ name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1874
+ if (name && name[0] == 'm' && name[1] == '\0') {
1875
+ mCode = code;
1876
+ }
1877
+ if (letterCode < 0 && name && name[1] == '\0' &&
1878
+ ((name[0] >= 'A' && name[0] <= 'Z') ||
1879
+ (name[0] >= 'a' && name[0] <= 'z'))) {
1880
+ letterCode = code;
1881
+ }
1882
+ if (anyCode < 0 && name &&
1883
+ ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
1884
+ anyCode = code;
1885
+ }
1886
+ }
1887
+ if (mCode >= 0 &&
1888
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1889
+ // 0.6 is a generic average 'm' width -- yes, this is a hack
1890
+ curFontSize *= w / 0.6;
1891
+ } else if (letterCode >= 0 &&
1892
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1893
+ // even more of a hack: 0.5 is a generic letter width
1894
+ curFontSize *= w / 0.5;
1895
+ } else if (anyCode >= 0 &&
1896
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1897
+ // better than nothing: 0.5 is a generic character width
1898
+ curFontSize *= w / 0.5;
1899
+ }
1900
+ fm = gfxFont->getFontMatrix();
1901
+ if (fm[0] != 0) {
1902
+ curFontSize *= fabs(fm[3] / fm[0]);
1903
+ }
1904
+ }
1905
+ }
1906
+
1907
+ void TextPage::beginWord(GfxState *state, double x0, double y0) {
1908
+ double *fontm;
1909
+ double m[4], m2[4];
1910
+ int rot;
1911
+
1912
+ // This check is needed because Type 3 characters can contain
1913
+ // text-drawing operations (when TextPage is being used via
1914
+ // {X,Win}SplashOutputDev rather than TextOutputDev).
1915
+ if (curWord) {
1916
+ ++nest;
1917
+ return;
1918
+ }
1919
+
1920
+ // compute the rotation
1921
+ state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
1922
+ if (state->getFont()->getType() == fontType3) {
1923
+ fontm = state->getFont()->getFontMatrix();
1924
+ m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
1925
+ m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
1926
+ m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
1927
+ m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
1928
+ m[0] = m2[0];
1929
+ m[1] = m2[1];
1930
+ m[2] = m2[2];
1931
+ m[3] = m2[3];
1932
+ }
1933
+ if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1934
+ rot = (m[3] < 0) ? 0 : 2;
1935
+ } else {
1936
+ rot = (m[2] > 0) ? 1 : 3;
1937
+ }
1938
+
1939
+ curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
1940
+ }
1941
+
1942
+ void TextPage::addChar(GfxState *state, double x, double y,
1943
+ double dx, double dy,
1944
+ CharCode c, int nBytes, Unicode *u, int uLen) {
1945
+ double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
1946
+ GBool overlap;
1947
+ int i;
1948
+
1949
+ // subtract char and word spacing from the dx,dy values
1950
+ sp = state->getCharSpace();
1951
+ if (c == (CharCode)0x20) {
1952
+ sp += state->getWordSpace();
1953
+ }
1954
+ state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1955
+ dx -= dx2;
1956
+ dy -= dy2;
1957
+ state->transformDelta(dx, dy, &w1, &h1);
1958
+
1959
+ // throw away chars that aren't inside the page bounds
1960
+ // (and also do a sanity check on the character size)
1961
+ state->transform(x, y, &x1, &y1);
1962
+ if (x1 + w1 < 0 || x1 > pageWidth ||
1963
+ y1 + h1 < 0 || y1 > pageHeight ||
1964
+ w1 > pageWidth || h1 > pageHeight) {
1965
+ charPos += nBytes;
1966
+ return;
1967
+ }
1968
+
1969
+ // check the tiny chars limit
1970
+ if (!globalParams->getTextKeepTinyChars() &&
1971
+ fabs(w1) < 3 && fabs(h1) < 3) {
1972
+ if (++nTinyChars > 50000) {
1973
+ charPos += nBytes;
1974
+ return;
1975
+ }
1976
+ }
1977
+
1978
+ // break words at space character
1979
+ if (uLen == 1 && u[0] == (Unicode)0x20) {
1980
+ if (curWord) {
1981
+ ++curWord->charLen;
1982
+ }
1983
+ charPos += nBytes;
1984
+ endWord();
1985
+ return;
1986
+ }
1987
+
1988
+ // start a new word if:
1989
+ // (1) this character doesn't fall in the right place relative to
1990
+ // the end of the previous word (this places upper and lower
1991
+ // constraints on the position deltas along both the primary
1992
+ // and secondary axes), or
1993
+ // (2) this character overlaps the previous one (duplicated text), or
1994
+ // (3) the previous character was an overlap (we want each duplicated
1995
+ // character to be in a word by itself at this stage),
1996
+ // (4) the font size has changed
1997
+ if (curWord && curWord->len > 0) {
1998
+ base = sp = delta = 0; // make gcc happy
1999
+ switch (curWord->rot) {
2000
+ case 0:
2001
+ base = y1;
2002
+ sp = x1 - curWord->xMax;
2003
+ delta = x1 - curWord->edge[curWord->len - 1];
2004
+ break;
2005
+ case 1:
2006
+ base = x1;
2007
+ sp = y1 - curWord->yMax;
2008
+ delta = y1 - curWord->edge[curWord->len - 1];
2009
+ break;
2010
+ case 2:
2011
+ base = y1;
2012
+ sp = curWord->xMin - x1;
2013
+ delta = curWord->edge[curWord->len - 1] - x1;
2014
+ break;
2015
+ case 3:
2016
+ base = x1;
2017
+ sp = curWord->yMin - y1;
2018
+ delta = curWord->edge[curWord->len - 1] - y1;
2019
+ break;
2020
+ }
2021
+ overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
2022
+ fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2023
+ if (overlap || lastCharOverlap ||
2024
+ sp < -minDupBreakOverlap * curWord->fontSize ||
2025
+ sp > minWordBreakSpace * curWord->fontSize ||
2026
+ fabs(base - curWord->base) > 0.5 ||
2027
+ curFontSize != curWord->fontSize) {
2028
+ endWord();
2029
+ }
2030
+ lastCharOverlap = overlap;
2031
+ } else {
2032
+ lastCharOverlap = gFalse;
2033
+ }
2034
+
2035
+ if (uLen != 0) {
2036
+ // start a new word if needed
2037
+ if (!curWord) {
2038
+ beginWord(state, x, y);
2039
+ }
2040
+
2041
+ // page rotation and/or transform matrices can cause text to be
2042
+ // drawn in reverse order -- in this case, swap the begin/end
2043
+ // coordinates and break text into individual chars
2044
+ if ((curWord->rot == 0 && w1 < 0) ||
2045
+ (curWord->rot == 1 && h1 < 0) ||
2046
+ (curWord->rot == 2 && w1 > 0) ||
2047
+ (curWord->rot == 3 && h1 > 0)) {
2048
+ endWord();
2049
+ beginWord(state, x + dx, y + dy);
2050
+ x1 += w1;
2051
+ y1 += h1;
2052
+ w1 = -w1;
2053
+ h1 = -h1;
2054
+ }
2055
+
2056
+ // add the characters to the current word
2057
+ w1 /= uLen;
2058
+ h1 /= uLen;
2059
+ for (i = 0; i < uLen; ++i) {
2060
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
2061
+ }
2062
+ }
2063
+ if (curWord) {
2064
+ curWord->charLen += nBytes;
2065
+ }
2066
+ charPos += nBytes;
2067
+ }
2068
+
2069
+ void TextPage::endWord() {
2070
+ // This check is needed because Type 3 characters can contain
2071
+ // text-drawing operations (when TextPage is being used via
2072
+ // {X,Win}SplashOutputDev rather than TextOutputDev).
2073
+ if (nest > 0) {
2074
+ --nest;
2075
+ return;
2076
+ }
2077
+
2078
+ if (curWord) {
2079
+ addWord(curWord);
2080
+ curWord = NULL;
2081
+ }
2082
+ }
2083
+
2084
+ void TextPage::addWord(TextWord *word) {
2085
+ // throw away zero-length words -- they don't have valid xMin/xMax
2086
+ // values, and they're useless anyway
2087
+ if (word->len == 0) {
2088
+ delete word;
2089
+ return;
2090
+ }
2091
+
2092
+ if (rawOrder) {
2093
+ if (rawLastWord) {
2094
+ rawLastWord->next = word;
2095
+ } else {
2096
+ rawWords = word;
2097
+ }
2098
+ rawLastWord = word;
2099
+ } else {
2100
+ pools[word->rot]->addWord(word);
2101
+ }
2102
+ }
2103
+
2104
+ void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
2105
+ underlines->append(new TextUnderline(x0, y0, x1, y1));
2106
+ }
2107
+
2108
+ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) {
2109
+ links->append(new TextLink(xMin, yMin, xMax, yMax, link));
2110
+ }
2111
+
2112
+ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
2113
+ UnicodeMap *uMap;
2114
+ TextPool *pool;
2115
+ TextWord *word0, *word1, *word2;
2116
+ TextLine *line;
2117
+ TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
2118
+ TextBlock **blkArray;
2119
+ TextFlow *flow, *lastFlow;
2120
+ TextUnderline *underline;
2121
+ TextLink *link;
2122
+ int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2123
+ double minBase, maxBase, newMinBase, newMaxBase;
2124
+ double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2125
+ GBool found;
2126
+ int count[4];
2127
+ int lrCount;
2128
+ int firstBlkIdx, nBlocksLeft;
2129
+ int col1, col2;
2130
+ int i, j, n;
2131
+
2132
+ if (rawOrder) {
2133
+ primaryRot = 0;
2134
+ primaryLR = gTrue;
2135
+ return;
2136
+ }
2137
+
2138
+ uMap = globalParams->getTextEncoding();
2139
+ blkList = NULL;
2140
+ lastBlk = NULL;
2141
+ nBlocks = 0;
2142
+ primaryRot = -1;
2143
+
2144
+ #if 0 // for debugging
2145
+ printf("*** initial words ***\n");
2146
+ for (rot = 0; rot < 4; ++rot) {
2147
+ pool = pools[rot];
2148
+ for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2149
+ for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2150
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2151
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2152
+ word0->base, word0->fontSize, rot*90, word0->link);
2153
+ for (i = 0; i < word0->len; ++i) {
2154
+ fputc(word0->text[i] & 0xff, stdout);
2155
+ }
2156
+ printf("'\n");
2157
+ }
2158
+ }
2159
+ }
2160
+ printf("\n");
2161
+ #endif
2162
+
2163
+ #if 0 //~ for debugging
2164
+ for (i = 0; i < underlines->getLength(); ++i) {
2165
+ underline = (TextUnderline *)underlines->get(i);
2166
+ printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2167
+ underline->x0, underline->x1, underline->y0, underline->y1,
2168
+ underline->horiz);
2169
+ }
2170
+ #endif
2171
+
2172
+ if (doHTML) {
2173
+
2174
+ //----- handle underlining
2175
+ for (i = 0; i < underlines->getLength(); ++i) {
2176
+ underline = (TextUnderline *)underlines->get(i);
2177
+ if (underline->horiz) {
2178
+ // rot = 0
2179
+ if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2180
+ startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2181
+ endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2182
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2183
+ for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2184
+ //~ need to check the y value against the word baseline
2185
+ if (underline->x0 < word0->xMin + underlineSlack &&
2186
+ word0->xMax - underlineSlack < underline->x1) {
2187
+ word0->underlined = gTrue;
2188
+ }
2189
+ }
2190
+ }
2191
+ }
2192
+
2193
+ // rot = 2
2194
+ if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2195
+ startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2196
+ endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2197
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2198
+ for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2199
+ if (underline->x0 < word0->xMin + underlineSlack &&
2200
+ word0->xMax - underlineSlack < underline->x1) {
2201
+ word0->underlined = gTrue;
2202
+ }
2203
+ }
2204
+ }
2205
+ }
2206
+ } else {
2207
+ // rot = 1
2208
+ if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2209
+ startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2210
+ endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2211
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2212
+ for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2213
+ if (underline->y0 < word0->yMin + underlineSlack &&
2214
+ word0->yMax - underlineSlack < underline->y1) {
2215
+ word0->underlined = gTrue;
2216
+ }
2217
+ }
2218
+ }
2219
+ }
2220
+
2221
+ // rot = 3
2222
+ if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2223
+ startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2224
+ endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2225
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2226
+ for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2227
+ if (underline->y0 < word0->yMin + underlineSlack &&
2228
+ word0->yMax - underlineSlack < underline->y1) {
2229
+ word0->underlined = gTrue;
2230
+ }
2231
+ }
2232
+ }
2233
+ }
2234
+ }
2235
+ }
2236
+
2237
+ //----- handle links
2238
+ for (i = 0; i < links->getLength(); ++i) {
2239
+ link = (TextLink *)links->get(i);
2240
+
2241
+ // rot = 0
2242
+ if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2243
+ startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2244
+ endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2245
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2246
+ for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2247
+ if (link->xMin < word0->xMin + hyperlinkSlack &&
2248
+ word0->xMax - hyperlinkSlack < link->xMax &&
2249
+ link->yMin < word0->yMin + hyperlinkSlack &&
2250
+ word0->yMax - hyperlinkSlack < link->yMax) {
2251
+ word0->link = link->link;
2252
+ }
2253
+ }
2254
+ }
2255
+ }
2256
+
2257
+ // rot = 2
2258
+ if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2259
+ startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2260
+ endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2261
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2262
+ for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2263
+ if (link->xMin < word0->xMin + hyperlinkSlack &&
2264
+ word0->xMax - hyperlinkSlack < link->xMax &&
2265
+ link->yMin < word0->yMin + hyperlinkSlack &&
2266
+ word0->yMax - hyperlinkSlack < link->yMax) {
2267
+ word0->link = link->link;
2268
+ }
2269
+ }
2270
+ }
2271
+ }
2272
+
2273
+ // rot = 1
2274
+ if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2275
+ startBaseIdx = pools[1]->getBaseIdx(link->xMin);
2276
+ endBaseIdx = pools[1]->getBaseIdx(link->xMax);
2277
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2278
+ for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2279
+ if (link->yMin < word0->yMin + hyperlinkSlack &&
2280
+ word0->yMax - hyperlinkSlack < link->yMax &&
2281
+ link->xMin < word0->xMin + hyperlinkSlack &&
2282
+ word0->xMax - hyperlinkSlack < link->xMax) {
2283
+ word0->link = link->link;
2284
+ }
2285
+ }
2286
+ }
2287
+ }
2288
+
2289
+ // rot = 3
2290
+ if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2291
+ startBaseIdx = pools[3]->getBaseIdx(link->xMin);
2292
+ endBaseIdx = pools[3]->getBaseIdx(link->xMax);
2293
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2294
+ for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2295
+ if (link->yMin < word0->yMin + hyperlinkSlack &&
2296
+ word0->yMax - hyperlinkSlack < link->yMax &&
2297
+ link->xMin < word0->xMin + hyperlinkSlack &&
2298
+ word0->xMax - hyperlinkSlack < link->xMax) {
2299
+ word0->link = link->link;
2300
+ }
2301
+ }
2302
+ }
2303
+ }
2304
+ }
2305
+ }
2306
+
2307
+ //----- assemble the blocks
2308
+
2309
+ //~ add an outer loop for writing mode (vertical text)
2310
+
2311
+ // build blocks for each rotation value
2312
+ for (rot = 0; rot < 4; ++rot) {
2313
+ pool = pools[rot];
2314
+ poolMinBaseIdx = pool->minBaseIdx;
2315
+ count[rot] = 0;
2316
+
2317
+ // add blocks until no more words are left
2318
+ while (1) {
2319
+
2320
+ // find the first non-empty line in the pool
2321
+ for (;
2322
+ poolMinBaseIdx <= pool->maxBaseIdx &&
2323
+ !pool->getPool(poolMinBaseIdx);
2324
+ ++poolMinBaseIdx) ;
2325
+ if (poolMinBaseIdx > pool->maxBaseIdx) {
2326
+ break;
2327
+ }
2328
+
2329
+ // look for the left-most word in the first four lines of the
2330
+ // pool -- this avoids starting with a superscript word
2331
+ startBaseIdx = poolMinBaseIdx;
2332
+ for (baseIdx = poolMinBaseIdx + 1;
2333
+ baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2334
+ ++baseIdx) {
2335
+ if (!pool->getPool(baseIdx)) {
2336
+ continue;
2337
+ }
2338
+ if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2339
+ < 0) {
2340
+ startBaseIdx = baseIdx;
2341
+ }
2342
+ }
2343
+
2344
+ // create a new block
2345
+ word0 = pool->getPool(startBaseIdx);
2346
+ pool->setPool(startBaseIdx, word0->next);
2347
+ word0->next = NULL;
2348
+ blk = new TextBlock(this, rot);
2349
+ blk->addWord(word0);
2350
+
2351
+ fontSize = word0->fontSize;
2352
+ minBase = maxBase = word0->base;
2353
+ colSpace1 = minColSpacing1 * fontSize;
2354
+ colSpace2 = minColSpacing2 * fontSize;
2355
+ lineSpace = maxLineSpacingDelta * fontSize;
2356
+ intraLineSpace = maxIntraLineDelta * fontSize;
2357
+
2358
+ // add words to the block
2359
+ do {
2360
+ found = gFalse;
2361
+
2362
+ // look for words on the line above the current top edge of
2363
+ // the block
2364
+ newMinBase = minBase;
2365
+ for (baseIdx = pool->getBaseIdx(minBase);
2366
+ baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2367
+ --baseIdx) {
2368
+ word0 = NULL;
2369
+ word1 = pool->getPool(baseIdx);
2370
+ while (word1) {
2371
+ if (word1->base < minBase &&
2372
+ word1->base >= minBase - lineSpace &&
2373
+ ((rot == 0 || rot == 2)
2374
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2375
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2376
+ fabs(word1->fontSize - fontSize) <
2377
+ maxBlockFontSizeDelta1 * fontSize) {
2378
+ word2 = word1;
2379
+ if (word0) {
2380
+ word0->next = word1->next;
2381
+ } else {
2382
+ pool->setPool(baseIdx, word1->next);
2383
+ }
2384
+ word1 = word1->next;
2385
+ word2->next = NULL;
2386
+ blk->addWord(word2);
2387
+ found = gTrue;
2388
+ newMinBase = word2->base;
2389
+ } else {
2390
+ word0 = word1;
2391
+ word1 = word1->next;
2392
+ }
2393
+ }
2394
+ }
2395
+ minBase = newMinBase;
2396
+
2397
+ // look for words on the line below the current bottom edge of
2398
+ // the block
2399
+ newMaxBase = maxBase;
2400
+ for (baseIdx = pool->getBaseIdx(maxBase);
2401
+ baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2402
+ ++baseIdx) {
2403
+ word0 = NULL;
2404
+ word1 = pool->getPool(baseIdx);
2405
+ while (word1) {
2406
+ if (word1->base > maxBase &&
2407
+ word1->base <= maxBase + lineSpace &&
2408
+ ((rot == 0 || rot == 2)
2409
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2410
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2411
+ fabs(word1->fontSize - fontSize) <
2412
+ maxBlockFontSizeDelta1 * fontSize) {
2413
+ word2 = word1;
2414
+ if (word0) {
2415
+ word0->next = word1->next;
2416
+ } else {
2417
+ pool->setPool(baseIdx, word1->next);
2418
+ }
2419
+ word1 = word1->next;
2420
+ word2->next = NULL;
2421
+ blk->addWord(word2);
2422
+ found = gTrue;
2423
+ newMaxBase = word2->base;
2424
+ } else {
2425
+ word0 = word1;
2426
+ word1 = word1->next;
2427
+ }
2428
+ }
2429
+ }
2430
+ maxBase = newMaxBase;
2431
+
2432
+ // look for words that are on lines already in the block, and
2433
+ // that overlap the block horizontally
2434
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2435
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2436
+ ++baseIdx) {
2437
+ word0 = NULL;
2438
+ word1 = pool->getPool(baseIdx);
2439
+ while (word1) {
2440
+ if (word1->base >= minBase - intraLineSpace &&
2441
+ word1->base <= maxBase + intraLineSpace &&
2442
+ ((rot == 0 || rot == 2)
2443
+ ? (word1->xMin < blk->xMax + colSpace1 &&
2444
+ word1->xMax > blk->xMin - colSpace1)
2445
+ : (word1->yMin < blk->yMax + colSpace1 &&
2446
+ word1->yMax > blk->yMin - colSpace1)) &&
2447
+ fabs(word1->fontSize - fontSize) <
2448
+ maxBlockFontSizeDelta2 * fontSize) {
2449
+ word2 = word1;
2450
+ if (word0) {
2451
+ word0->next = word1->next;
2452
+ } else {
2453
+ pool->setPool(baseIdx, word1->next);
2454
+ }
2455
+ word1 = word1->next;
2456
+ word2->next = NULL;
2457
+ blk->addWord(word2);
2458
+ found = gTrue;
2459
+ } else {
2460
+ word0 = word1;
2461
+ word1 = word1->next;
2462
+ }
2463
+ }
2464
+ }
2465
+
2466
+ // only check for outlying words (the next two chunks of code)
2467
+ // if we didn't find anything else
2468
+ if (found) {
2469
+ continue;
2470
+ }
2471
+
2472
+ // scan down the left side of the block, looking for words
2473
+ // that are near (but not overlapping) the block; if there are
2474
+ // three or fewer, add them to the block
2475
+ n = 0;
2476
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2477
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2478
+ ++baseIdx) {
2479
+ word1 = pool->getPool(baseIdx);
2480
+ while (word1) {
2481
+ if (word1->base >= minBase - intraLineSpace &&
2482
+ word1->base <= maxBase + intraLineSpace &&
2483
+ ((rot == 0 || rot == 2)
2484
+ ? (word1->xMax <= blk->xMin &&
2485
+ word1->xMax > blk->xMin - colSpace2)
2486
+ : (word1->yMax <= blk->yMin &&
2487
+ word1->yMax > blk->yMin - colSpace2)) &&
2488
+ fabs(word1->fontSize - fontSize) <
2489
+ maxBlockFontSizeDelta3 * fontSize) {
2490
+ ++n;
2491
+ break;
2492
+ }
2493
+ word1 = word1->next;
2494
+ }
2495
+ }
2496
+ if (n > 0 && n <= 3) {
2497
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2498
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2499
+ ++baseIdx) {
2500
+ word0 = NULL;
2501
+ word1 = pool->getPool(baseIdx);
2502
+ while (word1) {
2503
+ if (word1->base >= minBase - intraLineSpace &&
2504
+ word1->base <= maxBase + intraLineSpace &&
2505
+ ((rot == 0 || rot == 2)
2506
+ ? (word1->xMax <= blk->xMin &&
2507
+ word1->xMax > blk->xMin - colSpace2)
2508
+ : (word1->yMax <= blk->yMin &&
2509
+ word1->yMax > blk->yMin - colSpace2)) &&
2510
+ fabs(word1->fontSize - fontSize) <
2511
+ maxBlockFontSizeDelta3 * fontSize) {
2512
+ word2 = word1;
2513
+ if (word0) {
2514
+ word0->next = word1->next;
2515
+ } else {
2516
+ pool->setPool(baseIdx, word1->next);
2517
+ }
2518
+ word1 = word1->next;
2519
+ word2->next = NULL;
2520
+ blk->addWord(word2);
2521
+ if (word2->base < minBase) {
2522
+ minBase = word2->base;
2523
+ } else if (word2->base > maxBase) {
2524
+ maxBase = word2->base;
2525
+ }
2526
+ found = gTrue;
2527
+ break;
2528
+ } else {
2529
+ word0 = word1;
2530
+ word1 = word1->next;
2531
+ }
2532
+ }
2533
+ }
2534
+ }
2535
+
2536
+ // scan down the right side of the block, looking for words
2537
+ // that are near (but not overlapping) the block; if there are
2538
+ // three or fewer, add them to the block
2539
+ n = 0;
2540
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2541
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2542
+ ++baseIdx) {
2543
+ word1 = pool->getPool(baseIdx);
2544
+ while (word1) {
2545
+ if (word1->base >= minBase - intraLineSpace &&
2546
+ word1->base <= maxBase + intraLineSpace &&
2547
+ ((rot == 0 || rot == 2)
2548
+ ? (word1->xMin >= blk->xMax &&
2549
+ word1->xMin < blk->xMax + colSpace2)
2550
+ : (word1->yMin >= blk->yMax &&
2551
+ word1->yMin < blk->yMax + colSpace2)) &&
2552
+ fabs(word1->fontSize - fontSize) <
2553
+ maxBlockFontSizeDelta3 * fontSize) {
2554
+ ++n;
2555
+ break;
2556
+ }
2557
+ word1 = word1->next;
2558
+ }
2559
+ }
2560
+ if (n > 0 && n <= 3) {
2561
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2562
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2563
+ ++baseIdx) {
2564
+ word0 = NULL;
2565
+ word1 = pool->getPool(baseIdx);
2566
+ while (word1) {
2567
+ if (word1->base >= minBase - intraLineSpace &&
2568
+ word1->base <= maxBase + intraLineSpace &&
2569
+ ((rot == 0 || rot == 2)
2570
+ ? (word1->xMin >= blk->xMax &&
2571
+ word1->xMin < blk->xMax + colSpace2)
2572
+ : (word1->yMin >= blk->yMax &&
2573
+ word1->yMin < blk->yMax + colSpace2)) &&
2574
+ fabs(word1->fontSize - fontSize) <
2575
+ maxBlockFontSizeDelta3 * fontSize) {
2576
+ word2 = word1;
2577
+ if (word0) {
2578
+ word0->next = word1->next;
2579
+ } else {
2580
+ pool->setPool(baseIdx, word1->next);
2581
+ }
2582
+ word1 = word1->next;
2583
+ word2->next = NULL;
2584
+ blk->addWord(word2);
2585
+ if (word2->base < minBase) {
2586
+ minBase = word2->base;
2587
+ } else if (word2->base > maxBase) {
2588
+ maxBase = word2->base;
2589
+ }
2590
+ found = gTrue;
2591
+ break;
2592
+ } else {
2593
+ word0 = word1;
2594
+ word1 = word1->next;
2595
+ }
2596
+ }
2597
+ }
2598
+ }
2599
+
2600
+ } while (found);
2601
+
2602
+ //~ need to compute the primary writing mode (horiz/vert) in
2603
+ //~ addition to primary rotation
2604
+
2605
+ // coalesce the block, and add it to the list
2606
+ blk->coalesce(uMap);
2607
+ if (lastBlk) {
2608
+ lastBlk->next = blk;
2609
+ } else {
2610
+ blkList = blk;
2611
+ }
2612
+ lastBlk = blk;
2613
+ count[rot] += blk->charCount;
2614
+ if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2615
+ primaryRot = rot;
2616
+ }
2617
+ ++nBlocks;
2618
+ }
2619
+ }
2620
+
2621
+ #if 0 // for debugging
2622
+ printf("*** rotation ***\n");
2623
+ for (rot = 0; rot < 4; ++rot) {
2624
+ printf(" %d: %6d\n", rot, count[rot]);
2625
+ }
2626
+ printf(" primary rot = %d\n", primaryRot);
2627
+ printf("\n");
2628
+ #endif
2629
+
2630
+ #if 0 // for debugging
2631
+ printf("*** blocks ***\n");
2632
+ for (blk = blkList; blk; blk = blk->next) {
2633
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2634
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2635
+ for (line = blk->lines; line; line = line->next) {
2636
+ printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2637
+ line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2638
+ for (word0 = line->words; word0; word0 = word0->next) {
2639
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2640
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2641
+ word0->base, word0->fontSize, word0->spaceAfter);
2642
+ for (i = 0; i < word0->len; ++i) {
2643
+ fputc(word0->text[i] & 0xff, stdout);
2644
+ }
2645
+ printf("'\n");
2646
+ }
2647
+ }
2648
+ }
2649
+ printf("\n");
2650
+ #endif
2651
+
2652
+ // determine the primary direction
2653
+ lrCount = 0;
2654
+ for (blk = blkList; blk; blk = blk->next) {
2655
+ for (line = blk->lines; line; line = line->next) {
2656
+ for (word0 = line->words; word0; word0 = word0->next) {
2657
+ for (i = 0; i < word0->len; ++i) {
2658
+ if (unicodeTypeL(word0->text[i])) {
2659
+ ++lrCount;
2660
+ } else if (unicodeTypeR(word0->text[i])) {
2661
+ --lrCount;
2662
+ }
2663
+ }
2664
+ }
2665
+ }
2666
+ }
2667
+ primaryLR = lrCount >= 0;
2668
+
2669
+ #if 0 // for debugging
2670
+ printf("*** direction ***\n");
2671
+ printf("lrCount = %d\n", lrCount);
2672
+ printf("primaryLR = %d\n", primaryLR);
2673
+ #endif
2674
+
2675
+ //----- column assignment
2676
+
2677
+ // sort blocks into xy order for column assignment
2678
+ blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2679
+ for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2680
+ blocks[i] = blk;
2681
+ }
2682
+ qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2683
+
2684
+ // column assignment
2685
+ for (i = 0; i < nBlocks; ++i) {
2686
+ blk0 = blocks[i];
2687
+ col1 = 0;
2688
+ for (j = 0; j < i; ++j) {
2689
+ blk1 = blocks[j];
2690
+ col2 = 0; // make gcc happy
2691
+ switch (primaryRot) {
2692
+ case 0:
2693
+ if (blk0->xMin > blk1->xMax) {
2694
+ col2 = blk1->col + blk1->nColumns + 3;
2695
+ } else if (blk1->xMax == blk1->xMin) {
2696
+ col2 = blk1->col;
2697
+ } else {
2698
+ col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2699
+ (blk1->xMax - blk1->xMin)) *
2700
+ blk1->nColumns);
2701
+ }
2702
+ break;
2703
+ case 1:
2704
+ if (blk0->yMin > blk1->yMax) {
2705
+ col2 = blk1->col + blk1->nColumns + 3;
2706
+ } else if (blk1->yMax == blk1->yMin) {
2707
+ col2 = blk1->col;
2708
+ } else {
2709
+ col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2710
+ (blk1->yMax - blk1->yMin)) *
2711
+ blk1->nColumns);
2712
+ }
2713
+ break;
2714
+ case 2:
2715
+ if (blk0->xMax < blk1->xMin) {
2716
+ col2 = blk1->col + blk1->nColumns + 3;
2717
+ } else if (blk1->xMin == blk1->xMax) {
2718
+ col2 = blk1->col;
2719
+ } else {
2720
+ col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2721
+ (blk1->xMin - blk1->xMax)) *
2722
+ blk1->nColumns);
2723
+ }
2724
+ break;
2725
+ case 3:
2726
+ if (blk0->yMax < blk1->yMin) {
2727
+ col2 = blk1->col + blk1->nColumns + 3;
2728
+ } else if (blk1->yMin == blk1->yMax) {
2729
+ col2 = blk1->col;
2730
+ } else {
2731
+ col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2732
+ (blk1->yMin - blk1->yMax)) *
2733
+ blk1->nColumns);
2734
+ }
2735
+ break;
2736
+ }
2737
+ if (col2 > col1) {
2738
+ col1 = col2;
2739
+ }
2740
+ }
2741
+ blk0->col = col1;
2742
+ for (line = blk0->lines; line; line = line->next) {
2743
+ for (j = 0; j <= line->len; ++j) {
2744
+ line->col[j] += col1;
2745
+ }
2746
+ }
2747
+ }
2748
+
2749
+ #if 0 // for debugging
2750
+ printf("*** blocks, after column assignment ***\n");
2751
+ for (blk = blkList; blk; blk = blk->next) {
2752
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2753
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2754
+ blk->nColumns);
2755
+ for (line = blk->lines; line; line = line->next) {
2756
+ printf(" line:\n");
2757
+ for (word0 = line->words; word0; word0 = word0->next) {
2758
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2759
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2760
+ word0->base, word0->fontSize, word0->spaceAfter);
2761
+ for (i = 0; i < word0->len; ++i) {
2762
+ fputc(word0->text[i] & 0xff, stdout);
2763
+ }
2764
+ printf("'\n");
2765
+ }
2766
+ }
2767
+ }
2768
+ printf("\n");
2769
+ #endif
2770
+
2771
+ //----- reading order sort
2772
+
2773
+ // sort blocks into yx order (in preparation for reading order sort)
2774
+ qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot);
2775
+
2776
+ // compute space on left and right sides of each block
2777
+ for (i = 0; i < nBlocks; ++i) {
2778
+ blk0 = blocks[i];
2779
+ for (j = 0; j < nBlocks; ++j) {
2780
+ blk1 = blocks[j];
2781
+ if (blk1 != blk0) {
2782
+ blk0->updatePriMinMax(blk1);
2783
+ }
2784
+ }
2785
+ }
2786
+
2787
+ #if 0 // for debugging
2788
+ printf("*** blocks, after yx sort ***\n");
2789
+ for (i = 0; i < nBlocks; ++i) {
2790
+ blk = blocks[i];
2791
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
2792
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2793
+ blk->priMin, blk->priMax);
2794
+ for (line = blk->lines; line; line = line->next) {
2795
+ printf(" line:\n");
2796
+ for (word0 = line->words; word0; word0 = word0->next) {
2797
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2798
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2799
+ word0->base, word0->fontSize, word0->spaceAfter);
2800
+ for (j = 0; j < word0->len; ++j) {
2801
+ fputc(word0->text[j] & 0xff, stdout);
2802
+ }
2803
+ printf("'\n");
2804
+ }
2805
+ }
2806
+ }
2807
+ printf("\n");
2808
+ #endif
2809
+
2810
+ // build the flows
2811
+ //~ this needs to be adjusted for writing mode (vertical text)
2812
+ //~ this also needs to account for right-to-left column ordering
2813
+ blkArray = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2814
+ memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *));
2815
+ flows = lastFlow = NULL;
2816
+ firstBlkIdx = 0;
2817
+ nBlocksLeft = nBlocks;
2818
+ while (nBlocksLeft > 0) {
2819
+
2820
+ // find the upper-left-most block
2821
+ for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ;
2822
+ i = firstBlkIdx;
2823
+ blk = blkArray[i];
2824
+ for (j = firstBlkIdx + 1; j < nBlocks; ++j) {
2825
+ blk1 = blkArray[j];
2826
+ if (blk1) {
2827
+ if (blk && blk->secondaryDelta(blk1) > 0) {
2828
+ break;
2829
+ }
2830
+ if (blk1->primaryCmp(blk) < 0) {
2831
+ i = j;
2832
+ blk = blk1;
2833
+ }
2834
+ }
2835
+ }
2836
+ blkArray[i] = NULL;
2837
+ --nBlocksLeft;
2838
+ blk->next = NULL;
2839
+
2840
+ // create a new flow, starting with the upper-left-most block
2841
+ flow = new TextFlow(this, blk);
2842
+ if (lastFlow) {
2843
+ lastFlow->next = flow;
2844
+ } else {
2845
+ flows = flow;
2846
+ }
2847
+ lastFlow = flow;
2848
+ fontSize = blk->lines->words->fontSize;
2849
+
2850
+ // push the upper-left-most block on the stack
2851
+ blk->stackNext = NULL;
2852
+ blkStack = blk;
2853
+
2854
+ // find the other blocks in this flow
2855
+ while (blkStack) {
2856
+
2857
+ // find the upper-left-most block under (but within
2858
+ // maxBlockSpacing of) the top block on the stack
2859
+ blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize;
2860
+ blk = NULL;
2861
+ i = -1;
2862
+ for (j = firstBlkIdx; j < nBlocks; ++j) {
2863
+ blk1 = blkArray[j];
2864
+ if (blk1) {
2865
+ if (blkStack->secondaryDelta(blk1) > blkSpace) {
2866
+ break;
2867
+ }
2868
+ if (blk && blk->secondaryDelta(blk1) > 0) {
2869
+ break;
2870
+ }
2871
+ if (blk1->isBelow(blkStack) &&
2872
+ (!blk || blk1->primaryCmp(blk) < 0)) {
2873
+ i = j;
2874
+ blk = blk1;
2875
+ }
2876
+ }
2877
+ }
2878
+
2879
+ // if a suitable block was found, add it to the flow and push it
2880
+ // onto the stack
2881
+ if (blk && flow->blockFits(blk, blkStack)) {
2882
+ blkArray[i] = NULL;
2883
+ --nBlocksLeft;
2884
+ blk->next = NULL;
2885
+ flow->addBlock(blk);
2886
+ fontSize = blk->lines->words->fontSize;
2887
+ blk->stackNext = blkStack;
2888
+ blkStack = blk;
2889
+
2890
+ // otherwise (if there is no block under the top block or the
2891
+ // block is not suitable), pop the stack
2892
+ } else {
2893
+ blkStack = blkStack->stackNext;
2894
+ }
2895
+ }
2896
+ }
2897
+ gfree(blkArray);
2898
+
2899
+ #if 0 // for debugging
2900
+ printf("*** flows ***\n");
2901
+ for (flow = flows; flow; flow = flow->next) {
2902
+ printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
2903
+ flow->xMin, flow->xMax, flow->yMin, flow->yMax,
2904
+ flow->priMin, flow->priMax);
2905
+ for (blk = flow->blocks; blk; blk = blk->next) {
2906
+ printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
2907
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2908
+ blk->priMin, blk->priMax);
2909
+ for (line = blk->lines; line; line = line->next) {
2910
+ printf(" line:\n");
2911
+ for (word0 = line->words; word0; word0 = word0->next) {
2912
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2913
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2914
+ word0->base, word0->fontSize, word0->spaceAfter);
2915
+ for (i = 0; i < word0->len; ++i) {
2916
+ fputc(word0->text[i] & 0xff, stdout);
2917
+ }
2918
+ printf("'\n");
2919
+ }
2920
+ }
2921
+ }
2922
+ }
2923
+ printf("\n");
2924
+ #endif
2925
+
2926
+ if (uMap) {
2927
+ uMap->decRefCnt();
2928
+ }
2929
+ }
2930
+
2931
+ GBool TextPage::findText(Unicode *s, int len,
2932
+ GBool startAtTop, GBool stopAtBottom,
2933
+ GBool startAtLast, GBool stopAtLast,
2934
+ GBool caseSensitive, GBool backward,
2935
+ double *xMin, double *yMin,
2936
+ double *xMax, double *yMax) {
2937
+ TextBlock *blk;
2938
+ TextLine *line;
2939
+ Unicode *s2, *txt;
2940
+ Unicode *p;
2941
+ int txtSize, m, i, j, k;
2942
+ double xStart, yStart, xStop, yStop;
2943
+ double xMin0, yMin0, xMax0, yMax0;
2944
+ double xMin1, yMin1, xMax1, yMax1;
2945
+ GBool found;
2946
+
2947
+ //~ needs to handle right-to-left text
2948
+
2949
+ if (rawOrder) {
2950
+ return gFalse;
2951
+ }
2952
+
2953
+ // convert the search string to uppercase
2954
+ if (!caseSensitive) {
2955
+ s2 = (Unicode *)gmallocn(len, sizeof(Unicode));
2956
+ for (i = 0; i < len; ++i) {
2957
+ s2[i] = unicodeToUpper(s[i]);
2958
+ }
2959
+ } else {
2960
+ s2 = s;
2961
+ }
2962
+
2963
+ txt = NULL;
2964
+ txtSize = 0;
2965
+
2966
+ xStart = yStart = xStop = yStop = 0;
2967
+ if (startAtLast && haveLastFind) {
2968
+ xStart = lastFindXMin;
2969
+ yStart = lastFindYMin;
2970
+ } else if (!startAtTop) {
2971
+ xStart = *xMin;
2972
+ yStart = *yMin;
2973
+ }
2974
+ if (stopAtLast && haveLastFind) {
2975
+ xStop = lastFindXMin;
2976
+ yStop = lastFindYMin;
2977
+ } else if (!stopAtBottom) {
2978
+ xStop = *xMax;
2979
+ yStop = *yMax;
2980
+ }
2981
+
2982
+ found = gFalse;
2983
+ xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2984
+ xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2985
+
2986
+ for (i = backward ? nBlocks - 1 : 0;
2987
+ backward ? i >= 0 : i < nBlocks;
2988
+ i += backward ? -1 : 1) {
2989
+ blk = blocks[i];
2990
+
2991
+ // check: is the block above the top limit?
2992
+ if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
2993
+ continue;
2994
+ }
2995
+
2996
+ // check: is the block below the bottom limit?
2997
+ if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
2998
+ break;
2999
+ }
3000
+
3001
+ for (line = blk->lines; line; line = line->next) {
3002
+
3003
+ // check: is the line above the top limit?
3004
+ if (!startAtTop &&
3005
+ (backward ? line->yMin > yStart : line->yMin < yStart)) {
3006
+ continue;
3007
+ }
3008
+
3009
+ // check: is the line below the bottom limit?
3010
+ if (!stopAtBottom &&
3011
+ (backward ? line->yMin < yStop : line->yMin > yStop)) {
3012
+ continue;
3013
+ }
3014
+
3015
+ // convert the line to uppercase
3016
+ m = line->len;
3017
+ if (!caseSensitive) {
3018
+ if (m > txtSize) {
3019
+ txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3020
+ txtSize = m;
3021
+ }
3022
+ for (k = 0; k < m; ++k) {
3023
+ txt[k] = unicodeToUpper(line->text[k]);
3024
+ }
3025
+ } else {
3026
+ txt = line->text;
3027
+ }
3028
+
3029
+ // search each position in this line
3030
+ j = backward ? m - len : 0;
3031
+ p = txt + j;
3032
+ while (backward ? j >= 0 : j <= m - len) {
3033
+
3034
+ // compare the strings
3035
+ for (k = 0; k < len; ++k) {
3036
+ if (p[k] != s2[k]) {
3037
+ break;
3038
+ }
3039
+ }
3040
+
3041
+ // found it
3042
+ if (k == len) {
3043
+ switch (line->rot) {
3044
+ case 0:
3045
+ xMin1 = line->edge[j];
3046
+ xMax1 = line->edge[j + len];
3047
+ yMin1 = line->yMin;
3048
+ yMax1 = line->yMax;
3049
+ break;
3050
+ case 1:
3051
+ xMin1 = line->xMin;
3052
+ xMax1 = line->xMax;
3053
+ yMin1 = line->edge[j];
3054
+ yMax1 = line->edge[j + len];
3055
+ break;
3056
+ case 2:
3057
+ xMin1 = line->edge[j + len];
3058
+ xMax1 = line->edge[j];
3059
+ yMin1 = line->yMin;
3060
+ yMax1 = line->yMax;
3061
+ break;
3062
+ case 3:
3063
+ xMin1 = line->xMin;
3064
+ xMax1 = line->xMax;
3065
+ yMin1 = line->edge[j + len];
3066
+ yMax1 = line->edge[j];
3067
+ break;
3068
+ }
3069
+ if (backward) {
3070
+ if ((startAtTop ||
3071
+ yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
3072
+ (stopAtBottom ||
3073
+ yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3074
+ if (!found ||
3075
+ yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3076
+ xMin0 = xMin1;
3077
+ xMax0 = xMax1;
3078
+ yMin0 = yMin1;
3079
+ yMax0 = yMax1;
3080
+ found = gTrue;
3081
+ }
3082
+ }
3083
+ } else {
3084
+ if ((startAtTop ||
3085
+ yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
3086
+ (stopAtBottom ||
3087
+ yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
3088
+ if (!found ||
3089
+ yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
3090
+ xMin0 = xMin1;
3091
+ xMax0 = xMax1;
3092
+ yMin0 = yMin1;
3093
+ yMax0 = yMax1;
3094
+ found = gTrue;
3095
+ }
3096
+ }
3097
+ }
3098
+ }
3099
+ if (backward) {
3100
+ --j;
3101
+ --p;
3102
+ } else {
3103
+ ++j;
3104
+ ++p;
3105
+ }
3106
+ }
3107
+ }
3108
+ }
3109
+
3110
+ if (!caseSensitive) {
3111
+ gfree(s2);
3112
+ gfree(txt);
3113
+ }
3114
+
3115
+ if (found) {
3116
+ *xMin = xMin0;
3117
+ *xMax = xMax0;
3118
+ *yMin = yMin0;
3119
+ *yMax = yMax0;
3120
+ lastFindXMin = xMin0;
3121
+ lastFindYMin = yMin0;
3122
+ haveLastFind = gTrue;
3123
+ return gTrue;
3124
+ }
3125
+
3126
+ return gFalse;
3127
+ }
3128
+
3129
+ GString *TextPage::getText(double xMin, double yMin,
3130
+ double xMax, double yMax) {
3131
+ GString *s;
3132
+ UnicodeMap *uMap;
3133
+ GBool isUnicode;
3134
+ TextBlock *blk;
3135
+ TextLine *line;
3136
+ TextLineFrag *frags;
3137
+ int nFrags, fragsSize;
3138
+ TextLineFrag *frag;
3139
+ char space[8], eol[16];
3140
+ int spaceLen, eolLen;
3141
+ int lastRot;
3142
+ double x, y, delta;
3143
+ int col, idx0, idx1, i, j;
3144
+ GBool multiLine, oneRot;
3145
+
3146
+ s = new GString();
3147
+
3148
+ if (rawOrder) {
3149
+ return s;
3150
+ }
3151
+
3152
+ // get the output encoding
3153
+ if (!(uMap = globalParams->getTextEncoding())) {
3154
+ return s;
3155
+ }
3156
+ isUnicode = uMap->isUnicode();
3157
+ spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3158
+ eolLen = 0; // make gcc happy
3159
+ switch (globalParams->getTextEOL()) {
3160
+ case eolUnix:
3161
+ eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3162
+ break;
3163
+ case eolDOS:
3164
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3165
+ eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3166
+ break;
3167
+ case eolMac:
3168
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3169
+ break;
3170
+ }
3171
+
3172
+ //~ writing mode (horiz/vert)
3173
+
3174
+ // collect the line fragments that are in the rectangle
3175
+ fragsSize = 256;
3176
+ frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3177
+ nFrags = 0;
3178
+ lastRot = -1;
3179
+ oneRot = gTrue;
3180
+ for (i = 0; i < nBlocks; ++i) {
3181
+ blk = blocks[i];
3182
+ if (xMin < blk->xMax && blk->xMin < xMax &&
3183
+ yMin < blk->yMax && blk->yMin < yMax) {
3184
+ for (line = blk->lines; line; line = line->next) {
3185
+ if (xMin < line->xMax && line->xMin < xMax &&
3186
+ yMin < line->yMax && line->yMin < yMax) {
3187
+ idx0 = idx1 = -1;
3188
+ switch (line->rot) {
3189
+ case 0:
3190
+ y = 0.5 * (line->yMin + line->yMax);
3191
+ if (yMin < y && y < yMax) {
3192
+ j = 0;
3193
+ while (j < line->len) {
3194
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3195
+ idx0 = j;
3196
+ break;
3197
+ }
3198
+ ++j;
3199
+ }
3200
+ j = line->len - 1;
3201
+ while (j >= 0) {
3202
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3203
+ idx1 = j;
3204
+ break;
3205
+ }
3206
+ --j;
3207
+ }
3208
+ }
3209
+ break;
3210
+ case 1:
3211
+ x = 0.5 * (line->xMin + line->xMax);
3212
+ if (xMin < x && x < xMax) {
3213
+ j = 0;
3214
+ while (j < line->len) {
3215
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3216
+ idx0 = j;
3217
+ break;
3218
+ }
3219
+ ++j;
3220
+ }
3221
+ j = line->len - 1;
3222
+ while (j >= 0) {
3223
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3224
+ idx1 = j;
3225
+ break;
3226
+ }
3227
+ --j;
3228
+ }
3229
+ }
3230
+ break;
3231
+ case 2:
3232
+ y = 0.5 * (line->yMin + line->yMax);
3233
+ if (yMin < y && y < yMax) {
3234
+ j = 0;
3235
+ while (j < line->len) {
3236
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3237
+ idx0 = j;
3238
+ break;
3239
+ }
3240
+ ++j;
3241
+ }
3242
+ j = line->len - 1;
3243
+ while (j >= 0) {
3244
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3245
+ idx1 = j;
3246
+ break;
3247
+ }
3248
+ --j;
3249
+ }
3250
+ }
3251
+ break;
3252
+ case 3:
3253
+ x = 0.5 * (line->xMin + line->xMax);
3254
+ if (xMin < x && x < xMax) {
3255
+ j = 0;
3256
+ while (j < line->len) {
3257
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3258
+ idx0 = j;
3259
+ break;
3260
+ }
3261
+ ++j;
3262
+ }
3263
+ j = line->len - 1;
3264
+ while (j >= 0) {
3265
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3266
+ idx1 = j;
3267
+ break;
3268
+ }
3269
+ --j;
3270
+ }
3271
+ }
3272
+ break;
3273
+ }
3274
+ if (idx0 >= 0 && idx1 >= 0) {
3275
+ if (nFrags == fragsSize) {
3276
+ fragsSize *= 2;
3277
+ frags = (TextLineFrag *)
3278
+ greallocn(frags, fragsSize, sizeof(TextLineFrag));
3279
+ }
3280
+ frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
3281
+ ++nFrags;
3282
+ if (lastRot >= 0 && line->rot != lastRot) {
3283
+ oneRot = gFalse;
3284
+ }
3285
+ lastRot = line->rot;
3286
+ }
3287
+ }
3288
+ }
3289
+ }
3290
+ }
3291
+
3292
+ // sort the fragments and generate the string
3293
+ if (nFrags > 0) {
3294
+
3295
+ for (i = 0; i < nFrags; ++i) {
3296
+ frags[i].computeCoords(oneRot);
3297
+ }
3298
+ assignColumns(frags, nFrags, oneRot);
3299
+
3300
+ // if all lines in the region have the same rotation, use it;
3301
+ // otherwise, use the page's primary rotation
3302
+ if (oneRot) {
3303
+ qsort(frags, nFrags, sizeof(TextLineFrag),
3304
+ &TextLineFrag::cmpYXLineRot);
3305
+ } else {
3306
+ qsort(frags, nFrags, sizeof(TextLineFrag),
3307
+ &TextLineFrag::cmpYXPrimaryRot);
3308
+ }
3309
+ i = 0;
3310
+ while (i < nFrags) {
3311
+ delta = maxIntraLineDelta * frags[i].line->words->fontSize;
3312
+ for (j = i+1;
3313
+ j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
3314
+ ++j) ;
3315
+ qsort(frags + i, j - i, sizeof(TextLineFrag),
3316
+ oneRot ? &TextLineFrag::cmpXYColumnLineRot
3317
+ : &TextLineFrag::cmpXYColumnPrimaryRot);
3318
+ i = j;
3319
+ }
3320
+
3321
+ col = 0;
3322
+ multiLine = gFalse;
3323
+ for (i = 0; i < nFrags; ++i) {
3324
+ frag = &frags[i];
3325
+
3326
+ // insert a return
3327
+ if (frag->col < col ||
3328
+ (i > 0 && fabs(frag->base - frags[i-1].base) >
3329
+ maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
3330
+ s->append(eol, eolLen);
3331
+ col = 0;
3332
+ multiLine = gTrue;
3333
+ }
3334
+
3335
+ // column alignment
3336
+ for (; col < frag->col; ++col) {
3337
+ s->append(space, spaceLen);
3338
+ }
3339
+
3340
+ // get the fragment text
3341
+ col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3342
+ }
3343
+
3344
+ if (multiLine) {
3345
+ s->append(eol, eolLen);
3346
+ }
3347
+ }
3348
+
3349
+ gfree(frags);
3350
+ uMap->decRefCnt();
3351
+
3352
+ return s;
3353
+ }
3354
+
3355
+ GBool TextPage::findCharRange(int pos, int length,
3356
+ double *xMin, double *yMin,
3357
+ double *xMax, double *yMax) {
3358
+ TextBlock *blk;
3359
+ TextLine *line;
3360
+ TextWord *word;
3361
+ double xMin0, xMax0, yMin0, yMax0;
3362
+ double xMin1, xMax1, yMin1, yMax1;
3363
+ GBool first;
3364
+ int i, j0, j1;
3365
+
3366
+ if (rawOrder) {
3367
+ return gFalse;
3368
+ }
3369
+
3370
+ //~ this doesn't correctly handle:
3371
+ //~ - ranges split across multiple lines (the highlighted region
3372
+ //~ is the bounding box of all the parts of the range)
3373
+ //~ - cases where characters don't convert one-to-one into Unicode
3374
+ first = gTrue;
3375
+ xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3376
+ xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3377
+ for (i = 0; i < nBlocks; ++i) {
3378
+ blk = blocks[i];
3379
+ for (line = blk->lines; line; line = line->next) {
3380
+ for (word = line->words; word; word = word->next) {
3381
+ if (pos < word->charPos + word->charLen &&
3382
+ word->charPos < pos + length) {
3383
+ j0 = pos - word->charPos;
3384
+ if (j0 < 0) {
3385
+ j0 = 0;
3386
+ }
3387
+ j1 = pos + length - 1 - word->charPos;
3388
+ if (j1 >= word->len) {
3389
+ j1 = word->len - 1;
3390
+ }
3391
+ switch (line->rot) {
3392
+ case 0:
3393
+ xMin1 = word->edge[j0];
3394
+ xMax1 = word->edge[j1 + 1];
3395
+ yMin1 = word->yMin;
3396
+ yMax1 = word->yMax;
3397
+ break;
3398
+ case 1:
3399
+ xMin1 = word->xMin;
3400
+ xMax1 = word->xMax;
3401
+ yMin1 = word->edge[j0];
3402
+ yMax1 = word->edge[j1 + 1];
3403
+ break;
3404
+ case 2:
3405
+ xMin1 = word->edge[j1 + 1];
3406
+ xMax1 = word->edge[j0];
3407
+ yMin1 = word->yMin;
3408
+ yMax1 = word->yMax;
3409
+ break;
3410
+ case 3:
3411
+ xMin1 = word->xMin;
3412
+ xMax1 = word->xMax;
3413
+ yMin1 = word->edge[j1 + 1];
3414
+ yMax1 = word->edge[j0];
3415
+ break;
3416
+ }
3417
+ if (first || xMin1 < xMin0) {
3418
+ xMin0 = xMin1;
3419
+ }
3420
+ if (first || xMax1 > xMax0) {
3421
+ xMax0 = xMax1;
3422
+ }
3423
+ if (first || yMin1 < yMin0) {
3424
+ yMin0 = yMin1;
3425
+ }
3426
+ if (first || yMax1 > yMax0) {
3427
+ yMax0 = yMax1;
3428
+ }
3429
+ first = gFalse;
3430
+ }
3431
+ }
3432
+ }
3433
+ }
3434
+ if (!first) {
3435
+ *xMin = xMin0;
3436
+ *xMax = xMax0;
3437
+ *yMin = yMin0;
3438
+ *yMax = yMax0;
3439
+ return gTrue;
3440
+ }
3441
+ return gFalse;
3442
+ }
3443
+
3444
+ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
3445
+ GBool physLayout) {
3446
+ UnicodeMap *uMap;
3447
+ TextFlow *flow;
3448
+ TextBlock *blk;
3449
+ TextLine *line;
3450
+ TextLineFrag *frags;
3451
+ TextWord *word;
3452
+ int nFrags, fragsSize;
3453
+ TextLineFrag *frag;
3454
+ char space[8], eol[16], eop[8];
3455
+ int spaceLen, eolLen, eopLen;
3456
+ GBool pageBreaks;
3457
+ GString *s;
3458
+ double delta;
3459
+ int col, i, j, d, n;
3460
+
3461
+ // get the output encoding
3462
+ if (!(uMap = globalParams->getTextEncoding())) {
3463
+ return;
3464
+ }
3465
+ spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3466
+ eolLen = 0; // make gcc happy
3467
+ switch (globalParams->getTextEOL()) {
3468
+ case eolUnix:
3469
+ eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3470
+ break;
3471
+ case eolDOS:
3472
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3473
+ eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3474
+ break;
3475
+ case eolMac:
3476
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3477
+ break;
3478
+ }
3479
+ eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
3480
+ pageBreaks = globalParams->getTextPageBreaks();
3481
+
3482
+ //~ writing mode (horiz/vert)
3483
+
3484
+ // output the page in raw (content stream) order
3485
+ if (rawOrder) {
3486
+
3487
+ for (word = rawWords; word; word = word->next) {
3488
+ s = new GString();
3489
+ dumpFragment(word->text, word->len, uMap, s);
3490
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
3491
+ delete s;
3492
+ if (word->next &&
3493
+ fabs(word->next->base - word->base) <
3494
+ maxIntraLineDelta * word->fontSize) {
3495
+ if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
3496
+ (*outputFunc)(outputStream, space, spaceLen);
3497
+ }
3498
+ } else {
3499
+ (*outputFunc)(outputStream, eol, eolLen);
3500
+ }
3501
+ }
3502
+
3503
+ // output the page, maintaining the original physical layout
3504
+ } else if (physLayout) {
3505
+
3506
+ // collect the line fragments for the page and sort them
3507
+ fragsSize = 256;
3508
+ frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3509
+ nFrags = 0;
3510
+ for (i = 0; i < nBlocks; ++i) {
3511
+ blk = blocks[i];
3512
+ for (line = blk->lines; line; line = line->next) {
3513
+ if (nFrags == fragsSize) {
3514
+ fragsSize *= 2;
3515
+ frags = (TextLineFrag *)greallocn(frags,
3516
+ fragsSize, sizeof(TextLineFrag));
3517
+ }
3518
+ frags[nFrags].init(line, 0, line->len);
3519
+ frags[nFrags].computeCoords(gTrue);
3520
+ ++nFrags;
3521
+ }
3522
+ }
3523
+ qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
3524
+ i = 0;
3525
+ while (i < nFrags) {
3526
+ delta = maxIntraLineDelta * frags[i].line->words->fontSize;
3527
+ for (j = i+1;
3528
+ j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
3529
+ ++j) ;
3530
+ qsort(frags + i, j - i, sizeof(TextLineFrag),
3531
+ &TextLineFrag::cmpXYColumnPrimaryRot);
3532
+ i = j;
3533
+ }
3534
+
3535
+ #if 0 // for debugging
3536
+ printf("*** line fragments ***\n");
3537
+ for (i = 0; i < nFrags; ++i) {
3538
+ frag = &frags[i];
3539
+ printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
3540
+ frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
3541
+ for (n = 0; n < frag->len; ++n) {
3542
+ fputc(frag->line->text[frag->start + n] & 0xff, stdout);
3543
+ }
3544
+ printf("'\n");
3545
+ }
3546
+ printf("\n");
3547
+ #endif
3548
+
3549
+ // generate output
3550
+ col = 0;
3551
+ for (i = 0; i < nFrags; ++i) {
3552
+ frag = &frags[i];
3553
+
3554
+ // column alignment
3555
+ for (; col < frag->col; ++col) {
3556
+ (*outputFunc)(outputStream, space, spaceLen);
3557
+ }
3558
+
3559
+ // print the line
3560
+ s = new GString();
3561
+ col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3562
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
3563
+ delete s;
3564
+
3565
+ // print one or more returns if necessary
3566
+ if (i == nFrags - 1 ||
3567
+ frags[i+1].col < col ||
3568
+ fabs(frags[i+1].base - frag->base) >
3569
+ maxIntraLineDelta * frag->line->words->fontSize) {
3570
+ if (i < nFrags - 1) {
3571
+ d = (int)((frags[i+1].base - frag->base) /
3572
+ frag->line->words->fontSize);
3573
+ if (d < 1) {
3574
+ d = 1;
3575
+ } else if (d > 5) {
3576
+ d = 5;
3577
+ }
3578
+ } else {
3579
+ d = 1;
3580
+ }
3581
+ for (; d > 0; --d) {
3582
+ (*outputFunc)(outputStream, eol, eolLen);
3583
+ }
3584
+ col = 0;
3585
+ }
3586
+ }
3587
+
3588
+ gfree(frags);
3589
+
3590
+ // output the page, "undoing" the layout
3591
+ } else {
3592
+ for (flow = flows; flow; flow = flow->next) {
3593
+ for (blk = flow->blocks; blk; blk = blk->next) {
3594
+ for (line = blk->lines; line; line = line->next) {
3595
+ n = line->len;
3596
+ if (line->hyphenated && (line->next || blk->next)) {
3597
+ --n;
3598
+ }
3599
+ s = new GString();
3600
+ dumpFragment(line->text, n, uMap, s);
3601
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
3602
+ delete s;
3603
+ if (!line->hyphenated) {
3604
+ if (line->next) {
3605
+ (*outputFunc)(outputStream, space, spaceLen);
3606
+ } else if (blk->next) {
3607
+ //~ this is a bit of a kludge - we should really do a more
3608
+ //~ intelligent determination of paragraphs
3609
+ if (blk->next->lines->words->fontSize ==
3610
+ blk->lines->words->fontSize) {
3611
+ (*outputFunc)(outputStream, space, spaceLen);
3612
+ } else {
3613
+ (*outputFunc)(outputStream, eol, eolLen);
3614
+ }
3615
+ }
3616
+ }
3617
+ }
3618
+ }
3619
+ (*outputFunc)(outputStream, eol, eolLen);
3620
+ (*outputFunc)(outputStream, eol, eolLen);
3621
+ }
3622
+ }
3623
+
3624
+ // end of page
3625
+ if (pageBreaks) {
3626
+ (*outputFunc)(outputStream, eop, eopLen);
3627
+ }
3628
+
3629
+ uMap->decRefCnt();
3630
+ }
3631
+
3632
+ void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
3633
+ TextLineFrag *frag0, *frag1;
3634
+ int rot, col1, col2, i, j, k;
3635
+
3636
+ // all text in the region has the same rotation -- recompute the
3637
+ // column numbers based only on the text in the region
3638
+ if (oneRot) {
3639
+ qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
3640
+ rot = frags[0].line->rot;
3641
+ for (i = 0; i < nFrags; ++i) {
3642
+ frag0 = &frags[i];
3643
+ col1 = 0;
3644
+ for (j = 0; j < i; ++j) {
3645
+ frag1 = &frags[j];
3646
+ col2 = 0; // make gcc happy
3647
+ switch (rot) {
3648
+ case 0:
3649
+ if (frag0->xMin >= frag1->xMax) {
3650
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3651
+ frag1->line->col[frag1->start]) + 1;
3652
+ } else {
3653
+ for (k = frag1->start;
3654
+ k < frag1->start + frag1->len &&
3655
+ frag0->xMin >= 0.5 * (frag1->line->edge[k] +
3656
+ frag1->line->edge[k+1]);
3657
+ ++k) ;
3658
+ col2 = frag1->col +
3659
+ frag1->line->col[k] - frag1->line->col[frag1->start];
3660
+ }
3661
+ break;
3662
+ case 1:
3663
+ if (frag0->yMin >= frag1->yMax) {
3664
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3665
+ frag1->line->col[frag1->start]) + 1;
3666
+ } else {
3667
+ for (k = frag1->start;
3668
+ k < frag1->start + frag1->len &&
3669
+ frag0->yMin >= 0.5 * (frag1->line->edge[k] +
3670
+ frag1->line->edge[k+1]);
3671
+ ++k) ;
3672
+ col2 = frag1->col +
3673
+ frag1->line->col[k] - frag1->line->col[frag1->start];
3674
+ }
3675
+ break;
3676
+ case 2:
3677
+ if (frag0->xMax <= frag1->xMin) {
3678
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3679
+ frag1->line->col[frag1->start]) + 1;
3680
+ } else {
3681
+ for (k = frag1->start;
3682
+ k < frag1->start + frag1->len &&
3683
+ frag0->xMax <= 0.5 * (frag1->line->edge[k] +
3684
+ frag1->line->edge[k+1]);
3685
+ ++k) ;
3686
+ col2 = frag1->col +
3687
+ frag1->line->col[k] - frag1->line->col[frag1->start];
3688
+ }
3689
+ break;
3690
+ case 3:
3691
+ if (frag0->yMax <= frag1->yMin) {
3692
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3693
+ frag1->line->col[frag1->start]) + 1;
3694
+ } else {
3695
+ for (k = frag1->start;
3696
+ k < frag1->start + frag1->len &&
3697
+ frag0->yMax <= 0.5 * (frag1->line->edge[k] +
3698
+ frag1->line->edge[k+1]);
3699
+ ++k) ;
3700
+ col2 = frag1->col +
3701
+ frag1->line->col[k] - frag1->line->col[frag1->start];
3702
+ }
3703
+ break;
3704
+ }
3705
+ if (col2 > col1) {
3706
+ col1 = col2;
3707
+ }
3708
+ }
3709
+ frag0->col = col1;
3710
+ }
3711
+
3712
+ // the region includes text at different rotations -- use the
3713
+ // globally assigned column numbers, offset by the minimum column
3714
+ // number (i.e., shift everything over to column 0)
3715
+ } else {
3716
+ col1 = frags[0].col;
3717
+ for (i = 1; i < nFrags; ++i) {
3718
+ if (frags[i].col < col1) {
3719
+ col1 = frags[i].col;
3720
+ }
3721
+ }
3722
+ for (i = 0; i < nFrags; ++i) {
3723
+ frags[i].col -= col1;
3724
+ }
3725
+ }
3726
+ }
3727
+
3728
+ int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
3729
+ GString *s) {
3730
+ char lre[8], rle[8], popdf[8], buf[8];
3731
+ int lreLen, rleLen, popdfLen, n;
3732
+ int nCols, i, j, k;
3733
+
3734
+ nCols = 0;
3735
+
3736
+ if (uMap->isUnicode()) {
3737
+
3738
+ lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
3739
+ rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
3740
+ popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
3741
+
3742
+ if (primaryLR) {
3743
+
3744
+ i = 0;
3745
+ while (i < len) {
3746
+ // output a left-to-right section
3747
+ for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
3748
+ for (k = i; k < j; ++k) {
3749
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3750
+ s->append(buf, n);
3751
+ ++nCols;
3752
+ }
3753
+ i = j;
3754
+ // output a right-to-left section
3755
+ for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
3756
+ if (j > i) {
3757
+ s->append(rle, rleLen);
3758
+ for (k = j - 1; k >= i; --k) {
3759
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3760
+ s->append(buf, n);
3761
+ ++nCols;
3762
+ }
3763
+ s->append(popdf, popdfLen);
3764
+ i = j;
3765
+ }
3766
+ }
3767
+
3768
+ } else {
3769
+
3770
+ s->append(rle, rleLen);
3771
+ i = len - 1;
3772
+ while (i >= 0) {
3773
+ // output a right-to-left section
3774
+ for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
3775
+ for (k = i; k > j; --k) {
3776
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3777
+ s->append(buf, n);
3778
+ ++nCols;
3779
+ }
3780
+ i = j;
3781
+ // output a left-to-right section
3782
+ for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
3783
+ if (j < i) {
3784
+ s->append(lre, lreLen);
3785
+ for (k = j + 1; k <= i; ++k) {
3786
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3787
+ s->append(buf, n);
3788
+ ++nCols;
3789
+ }
3790
+ s->append(popdf, popdfLen);
3791
+ i = j;
3792
+ }
3793
+ }
3794
+ s->append(popdf, popdfLen);
3795
+
3796
+ }
3797
+
3798
+ } else {
3799
+ for (i = 0; i < len; ++i) {
3800
+ n = uMap->mapUnicode(text[i], buf, sizeof(buf));
3801
+ s->append(buf, n);
3802
+ nCols += n;
3803
+ }
3804
+ }
3805
+
3806
+ return nCols;
3807
+ }
3808
+
3809
#if TEXTOUT_WORD_LIST
// Allocate a TextWordList built from this page.  physLayout is passed
// through to the TextWordList constructor.  The list is created with
// new and returned as a raw pointer.
TextWordList *TextPage::makeWordList(GBool physLayout) {
  TextWordList *words = new TextWordList(this, physLayout);
  return words;
}
#endif
3814
+
3815
+ //------------------------------------------------------------------------
3816
+ // TextOutputDev
3817
+ //------------------------------------------------------------------------
3818
+
3819
// Output callback that writes len bytes of text to a stdio stream.
//
//   stream  the destination, a FILE* passed as void*
//   text    the bytes to write (not necessarily NUL-terminated)
//   len     number of bytes to write
//
// Calls with a null stream, a null buffer, or a non-positive length are
// ignored.  Without the length check a negative len would be converted
// to a huge size_t by fwrite.
static void outputToFile(void *stream, char *text, int len) {
  if (!stream || !text || len <= 0) {
    return;
  }
  fwrite(text, 1, (size_t)len, (FILE *)stream);
}
3822
+
3823
+ TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
3824
+ GBool rawOrderA, GBool append) {
3825
+ text = NULL;
3826
+ physLayout = physLayoutA;
3827
+ rawOrder = rawOrderA;
3828
+ doHTML = gFalse;
3829
+ ok = gTrue;
3830
+
3831
+ // open file
3832
+ needClose = gFalse;
3833
+ if (fileName) {
3834
+ if (!strcmp(fileName, "-")) {
3835
+ outputStream = stdout;
3836
+ #ifdef WIN32
3837
+ // keep DOS from munging the end-of-line characters
3838
+ setmode(fileno(stdout), O_BINARY);
3839
+ #endif
3840
+ } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
3841
+ needClose = gTrue;
3842
+ } else {
3843
+ error(-1, "Couldn't open text file '%s'", fileName);
3844
+ ok = gFalse;
3845
+ return;
3846
+ }
3847
+ outputFunc = &outputToFile;
3848
+ } else {
3849
+ outputStream = NULL;
3850
+ }
3851
+
3852
+ // set up text object
3853
+ text = new TextPage(rawOrderA);
3854
+ }
3855
+
3856
+ TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
3857
+ GBool physLayoutA, GBool rawOrderA) {
3858
+ outputFunc = func;
3859
+ outputStream = stream;
3860
+ needClose = gFalse;
3861
+ physLayout = physLayoutA;
3862
+ rawOrder = rawOrderA;
3863
+ doHTML = gFalse;
3864
+ text = new TextPage(rawOrderA);
3865
+ ok = gTrue;
3866
+ }
3867
+
3868
+ TextOutputDev::~TextOutputDev() {
3869
+ if (needClose) {
3870
+ #ifdef MACOS
3871
+ ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
3872
+ #endif
3873
+ fclose((FILE *)outputStream);
3874
+ }
3875
+ if (text) {
3876
+ delete text;
3877
+ }
3878
+ }
3879
+
3880
+ void TextOutputDev::startPage(int pageNum, GfxState *state) {
3881
+ text->startPage(state);
3882
+ }
3883
+
3884
+ void TextOutputDev::endPage() {
3885
+ text->endPage();
3886
+ text->coalesce(physLayout, doHTML);
3887
+ if (outputStream) {
3888
+ text->dump(outputStream, outputFunc, physLayout);
3889
+ }
3890
+ }
3891
+
3892
+ void TextOutputDev::updateFont(GfxState *state) {
3893
+ text->updateFont(state);
3894
+ }
3895
+
3896
+ void TextOutputDev::beginString(GfxState *state, GString *s) {
3897
+ }
3898
+
3899
+ void TextOutputDev::endString(GfxState *state) {
3900
+ }
3901
+
3902
+ void TextOutputDev::drawChar(GfxState *state, double x, double y,
3903
+ double dx, double dy,
3904
+ double originX, double originY,
3905
+ CharCode c, int nBytes, Unicode *u, int uLen) {
3906
+ text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
3907
+ }
3908
+
3909
+ void TextOutputDev::stroke(GfxState *state) {
3910
+ GfxPath *path;
3911
+ GfxSubpath *subpath;
3912
+ double x[2], y[2];
3913
+
3914
+ if (!doHTML) {
3915
+ return;
3916
+ }
3917
+ path = state->getPath();
3918
+ if (path->getNumSubpaths() != 1) {
3919
+ return;
3920
+ }
3921
+ subpath = path->getSubpath(0);
3922
+ if (subpath->getNumPoints() != 2) {
3923
+ return;
3924
+ }
3925
+ state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
3926
+ state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
3927
+
3928
+ // look for a vertical or horizontal line
3929
+ if (x[0] == x[1] || y[0] == y[1]) {
3930
+ text->addUnderline(x[0], y[0], x[1], y[1]);
3931
+ }
3932
+ }
3933
+
3934
+ void TextOutputDev::fill(GfxState *state) {
3935
+ GfxPath *path;
3936
+ GfxSubpath *subpath;
3937
+ double x[5], y[5];
3938
+ double rx0, ry0, rx1, ry1, t;
3939
+ int i;
3940
+
3941
+ if (!doHTML) {
3942
+ return;
3943
+ }
3944
+ path = state->getPath();
3945
+ if (path->getNumSubpaths() != 1) {
3946
+ return;
3947
+ }
3948
+ subpath = path->getSubpath(0);
3949
+ if (subpath->getNumPoints() != 5) {
3950
+ return;
3951
+ }
3952
+ for (i = 0; i < 5; ++i) {
3953
+ if (subpath->getCurve(i)) {
3954
+ return;
3955
+ }
3956
+ state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
3957
+ }
3958
+
3959
+ // look for a rectangle
3960
+ if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
3961
+ x[0] == x[4] && y[0] == y[4]) {
3962
+ rx0 = x[0];
3963
+ ry0 = y[0];
3964
+ rx1 = x[2];
3965
+ ry1 = y[1];
3966
+ } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
3967
+ x[0] == x[4] && y[0] == y[4]) {
3968
+ rx0 = x[0];
3969
+ ry0 = y[0];
3970
+ rx1 = x[1];
3971
+ ry1 = y[2];
3972
+ } else {
3973
+ return;
3974
+ }
3975
+ if (rx1 < rx0) {
3976
+ t = rx0;
3977
+ rx0 = rx1;
3978
+ rx1 = t;
3979
+ }
3980
+ if (ry1 < ry0) {
3981
+ t = ry0;
3982
+ ry0 = ry1;
3983
+ ry1 = t;
3984
+ }
3985
+
3986
+ // skinny horizontal rectangle
3987
+ if (ry1 - ry0 < rx1 - rx0) {
3988
+ if (ry1 - ry0 < maxUnderlineWidth) {
3989
+ ry0 = 0.5 * (ry0 + ry1);
3990
+ text->addUnderline(rx0, ry0, rx1, ry0);
3991
+ }
3992
+
3993
+ // skinny vertical rectangle
3994
+ } else {
3995
+ if (rx1 - rx0 < maxUnderlineWidth) {
3996
+ rx0 = 0.5 * (rx0 + rx1);
3997
+ text->addUnderline(rx0, ry0, rx0, ry1);
3998
+ }
3999
+ }
4000
+ }
4001
+
4002
+ void TextOutputDev::eoFill(GfxState *state) {
4003
+ if (!doHTML) {
4004
+ return;
4005
+ }
4006
+ fill(state);
4007
+ }
4008
+
4009
+ void TextOutputDev::processLink(Link *link, Catalog *catalog) {
4010
+ double x1, y1, x2, y2;
4011
+ int xMin, yMin, xMax, yMax, x, y;
4012
+
4013
+ if (!doHTML) {
4014
+ return;
4015
+ }
4016
+ link->getRect(&x1, &y1, &x2, &y2);
4017
+ cvtUserToDev(x1, y1, &x, &y);
4018
+ xMin = xMax = x;
4019
+ yMin = yMax = y;
4020
+ cvtUserToDev(x1, y2, &x, &y);
4021
+ if (x < xMin) {
4022
+ xMin = x;
4023
+ } else if (x > xMax) {
4024
+ xMax = x;
4025
+ }
4026
+ if (y < yMin) {
4027
+ yMin = y;
4028
+ } else if (y > yMax) {
4029
+ yMax = y;
4030
+ }
4031
+ cvtUserToDev(x2, y1, &x, &y);
4032
+ if (x < xMin) {
4033
+ xMin = x;
4034
+ } else if (x > xMax) {
4035
+ xMax = x;
4036
+ }
4037
+ if (y < yMin) {
4038
+ yMin = y;
4039
+ } else if (y > yMax) {
4040
+ yMax = y;
4041
+ }
4042
+ cvtUserToDev(x2, y2, &x, &y);
4043
+ if (x < xMin) {
4044
+ xMin = x;
4045
+ } else if (x > xMax) {
4046
+ xMax = x;
4047
+ }
4048
+ if (y < yMin) {
4049
+ yMin = y;
4050
+ } else if (y > yMax) {
4051
+ yMax = y;
4052
+ }
4053
+ text->addLink(xMin, yMin, xMax, yMax, link);
4054
+ }
4055
+
4056
+ GBool TextOutputDev::findText(Unicode *s, int len,
4057
+ GBool startAtTop, GBool stopAtBottom,
4058
+ GBool startAtLast, GBool stopAtLast,
4059
+ GBool caseSensitive, GBool backward,
4060
+ double *xMin, double *yMin,
4061
+ double *xMax, double *yMax) {
4062
+ return text->findText(s, len, startAtTop, stopAtBottom,
4063
+ startAtLast, stopAtLast, caseSensitive, backward,
4064
+ xMin, yMin, xMax, yMax);
4065
+ }
4066
+
4067
+ GString *TextOutputDev::getText(double xMin, double yMin,
4068
+ double xMax, double yMax) {
4069
+ return text->getText(xMin, yMin, xMax, yMax);
4070
+ }
4071
+
4072
+ GBool TextOutputDev::findCharRange(int pos, int length,
4073
+ double *xMin, double *yMin,
4074
+ double *xMax, double *yMax) {
4075
+ return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
4076
+ }
4077
+
4078
#if TEXTOUT_WORD_LIST
// Build a word list for the current page, using this device's
// physLayout setting.
TextWordList *TextOutputDev::makeWordList() {
  TextWordList *words = text->makeWordList(physLayout);
  return words;
}
#endif
4083
+
4084
+ TextPage *TextOutputDev::takeText() {
4085
+ TextPage *ret;
4086
+
4087
+ ret = text;
4088
+ text = new TextPage(rawOrder);
4089
+ return ret;
4090
+ }