pdf2json 0.1.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (473) hide show
  1. data/README.markdown +9 -0
  2. data/bin/.gitkeep +0 -0
  3. data/ext/extconf.rb +30 -0
  4. data/lib/pdf2json.rb +8 -0
  5. data/pdf2json-0.52-source/AUTHORS +24 -0
  6. data/pdf2json-0.52-source/CHANGES +11 -0
  7. data/pdf2json-0.52-source/Makefile +84 -0
  8. data/pdf2json-0.52-source/Makefile.in +84 -0
  9. data/pdf2json-0.52-source/aclocal.m4 +274 -0
  10. data/pdf2json-0.52-source/aconf-win32.h +86 -0
  11. data/pdf2json-0.52-source/aconf.h +42 -0
  12. data/pdf2json-0.52-source/aconf.h.in +41 -0
  13. data/pdf2json-0.52-source/autom4te.cache/output.0 +6908 -0
  14. data/pdf2json-0.52-source/autom4te.cache/requests +76 -0
  15. data/pdf2json-0.52-source/autom4te.cache/traces.0 +466 -0
  16. data/pdf2json-0.52-source/config.log +1259 -0
  17. data/pdf2json-0.52-source/config.status +1050 -0
  18. data/pdf2json-0.52-source/configure +6908 -0
  19. data/pdf2json-0.52-source/configure.ac +93 -0
  20. data/pdf2json-0.52-source/doc/pdffonts.1 +130 -0
  21. data/pdf2json-0.52-source/doc/pdffonts.cat +107 -0
  22. data/pdf2json-0.52-source/doc/pdffonts.hlp +117 -0
  23. data/pdf2json-0.52-source/doc/pdfimages.1 +102 -0
  24. data/pdf2json-0.52-source/doc/pdfimages.cat +92 -0
  25. data/pdf2json-0.52-source/doc/pdfimages.hlp +101 -0
  26. data/pdf2json-0.52-source/doc/pdfinfo.1 +158 -0
  27. data/pdf2json-0.52-source/doc/pdfinfo.cat +119 -0
  28. data/pdf2json-0.52-source/doc/pdfinfo.hlp +129 -0
  29. data/pdf2json-0.52-source/doc/pdftoppm.1 +115 -0
  30. data/pdf2json-0.52-source/doc/pdftoppm.cat +105 -0
  31. data/pdf2json-0.52-source/doc/pdftoppm.hlp +114 -0
  32. data/pdf2json-0.52-source/doc/pdftops.1 +229 -0
  33. data/pdf2json-0.52-source/doc/pdftops.cat +221 -0
  34. data/pdf2json-0.52-source/doc/pdftops.hlp +231 -0
  35. data/pdf2json-0.52-source/doc/pdftotext.1 +137 -0
  36. data/pdf2json-0.52-source/doc/pdftotext.cat +120 -0
  37. data/pdf2json-0.52-source/doc/pdftotext.hlp +133 -0
  38. data/pdf2json-0.52-source/doc/sample-xpdfrc +91 -0
  39. data/pdf2json-0.52-source/doc/xpdf.1 +513 -0
  40. data/pdf2json-0.52-source/doc/xpdf.cat +476 -0
  41. data/pdf2json-0.52-source/doc/xpdf.hlp +489 -0
  42. data/pdf2json-0.52-source/doc/xpdfrc.5 +480 -0
  43. data/pdf2json-0.52-source/doc/xpdfrc.cat +474 -0
  44. data/pdf2json-0.52-source/doc/xpdfrc.hlp +479 -0
  45. data/pdf2json-0.52-source/fofi/.DS_Store +0 -0
  46. data/pdf2json-0.52-source/fofi/FoFiBase.cc +156 -0
  47. data/pdf2json-0.52-source/fofi/FoFiBase.h +57 -0
  48. data/pdf2json-0.52-source/fofi/FoFiBase.o +0 -0
  49. data/pdf2json-0.52-source/fofi/FoFiEncodings.cc +994 -0
  50. data/pdf2json-0.52-source/fofi/FoFiEncodings.h +36 -0
  51. data/pdf2json-0.52-source/fofi/FoFiEncodings.o +0 -0
  52. data/pdf2json-0.52-source/fofi/FoFiTrueType.cc +2027 -0
  53. data/pdf2json-0.52-source/fofi/FoFiTrueType.h +174 -0
  54. data/pdf2json-0.52-source/fofi/FoFiTrueType.o +0 -0
  55. data/pdf2json-0.52-source/fofi/FoFiType1.cc +252 -0
  56. data/pdf2json-0.52-source/fofi/FoFiType1.h +59 -0
  57. data/pdf2json-0.52-source/fofi/FoFiType1.o +0 -0
  58. data/pdf2json-0.52-source/fofi/FoFiType1C.cc +2603 -0
  59. data/pdf2json-0.52-source/fofi/FoFiType1C.h +233 -0
  60. data/pdf2json-0.52-source/fofi/FoFiType1C.o +0 -0
  61. data/pdf2json-0.52-source/fofi/Makefile +70 -0
  62. data/pdf2json-0.52-source/fofi/Makefile.dep +0 -0
  63. data/pdf2json-0.52-source/fofi/Makefile.in +70 -0
  64. data/pdf2json-0.52-source/fofi/libfofi.a +0 -0
  65. data/pdf2json-0.52-source/fofi/vms_make.com +0 -0
  66. data/pdf2json-0.52-source/freetype.win32/.DS_Store +0 -0
  67. data/pdf2json-0.52-source/freetype.win32/include/.DS_Store +0 -0
  68. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftconfig.h +528 -0
  69. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftheader.h +780 -0
  70. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftmodule.h +32 -0
  71. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftoption.h +733 -0
  72. data/pdf2json-0.52-source/freetype.win32/include/freetype/config/ftstdlib.h +173 -0
  73. data/pdf2json-0.52-source/freetype.win32/include/freetype/freetype.h +3919 -0
  74. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftadvanc.h +179 -0
  75. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftbbox.h +94 -0
  76. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftbdf.h +209 -0
  77. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftbitmap.h +227 -0
  78. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftcache.h +1128 -0
  79. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftchapters.h +103 -0
  80. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftcid.h +166 -0
  81. data/pdf2json-0.52-source/freetype.win32/include/freetype/fterrdef.h +244 -0
  82. data/pdf2json-0.52-source/freetype.win32/include/freetype/fterrors.h +206 -0
  83. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftgasp.h +120 -0
  84. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftglyph.h +613 -0
  85. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftgxval.h +358 -0
  86. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftgzip.h +102 -0
  87. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftimage.h +1313 -0
  88. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftincrem.h +353 -0
  89. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftlcdfil.h +213 -0
  90. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftlist.h +277 -0
  91. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftlzw.h +99 -0
  92. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftmac.h +274 -0
  93. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftmm.h +378 -0
  94. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftmodapi.h +483 -0
  95. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftmoderr.h +155 -0
  96. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftotval.h +203 -0
  97. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftoutln.h +537 -0
  98. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftpfr.h +172 -0
  99. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftrender.h +230 -0
  100. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftsizes.h +159 -0
  101. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftsnames.h +200 -0
  102. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftstroke.h +716 -0
  103. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftsynth.h +80 -0
  104. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftsystem.h +347 -0
  105. data/pdf2json-0.52-source/freetype.win32/include/freetype/fttrigon.h +350 -0
  106. data/pdf2json-0.52-source/freetype.win32/include/freetype/fttypes.h +588 -0
  107. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftwinfnt.h +274 -0
  108. data/pdf2json-0.52-source/freetype.win32/include/freetype/ftxf86.h +83 -0
  109. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/autohint.h +231 -0
  110. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftcalc.h +179 -0
  111. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftdebug.h +250 -0
  112. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftdriver.h +422 -0
  113. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftgloadr.h +168 -0
  114. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftmemory.h +380 -0
  115. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftobjs.h +1428 -0
  116. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftpic.h +67 -0
  117. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftrfork.h +196 -0
  118. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftserv.h +620 -0
  119. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftstream.h +539 -0
  120. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/fttrace.h +139 -0
  121. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/ftvalid.h +150 -0
  122. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/internal.h +51 -0
  123. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/pcftypes.h +56 -0
  124. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/psaux.h +873 -0
  125. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/pshints.h +712 -0
  126. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svbdf.h +77 -0
  127. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svcid.h +83 -0
  128. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svgldict.h +82 -0
  129. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svgxval.h +72 -0
  130. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svkern.h +51 -0
  131. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svmm.h +104 -0
  132. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svotval.h +55 -0
  133. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svpfr.h +66 -0
  134. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svpostnm.h +79 -0
  135. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svpscmap.h +164 -0
  136. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svpsinfo.h +92 -0
  137. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svsfnt.h +102 -0
  138. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svttcmap.h +106 -0
  139. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svtteng.h +53 -0
  140. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svttglyf.h +67 -0
  141. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svwinfnt.h +50 -0
  142. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/services/svxf86nm.h +55 -0
  143. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/sfnt.h +897 -0
  144. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/t1types.h +270 -0
  145. data/pdf2json-0.52-source/freetype.win32/include/freetype/internal/tttypes.h +1543 -0
  146. data/pdf2json-0.52-source/freetype.win32/include/freetype/t1tables.h +504 -0
  147. data/pdf2json-0.52-source/freetype.win32/include/freetype/ttnameid.h +1247 -0
  148. data/pdf2json-0.52-source/freetype.win32/include/freetype/tttables.h +759 -0
  149. data/pdf2json-0.52-source/freetype.win32/include/freetype/tttags.h +107 -0
  150. data/pdf2json-0.52-source/freetype.win32/include/freetype/ttunpat.h +59 -0
  151. data/pdf2json-0.52-source/freetype.win32/include/ft2build.h +39 -0
  152. data/pdf2json-0.52-source/freetype.win32/lib/freetype_a.lib +0 -0
  153. data/pdf2json-0.52-source/goo/.DS_Store +0 -0
  154. data/pdf2json-0.52-source/goo/FixedPoint.cc +118 -0
  155. data/pdf2json-0.52-source/goo/FixedPoint.h +155 -0
  156. data/pdf2json-0.52-source/goo/FixedPoint.o +0 -0
  157. data/pdf2json-0.52-source/goo/GHash.cc +380 -0
  158. data/pdf2json-0.52-source/goo/GHash.h +78 -0
  159. data/pdf2json-0.52-source/goo/GHash.o +0 -0
  160. data/pdf2json-0.52-source/goo/GList.cc +97 -0
  161. data/pdf2json-0.52-source/goo/GList.h +96 -0
  162. data/pdf2json-0.52-source/goo/GList.o +0 -0
  163. data/pdf2json-0.52-source/goo/GMutex.h +49 -0
  164. data/pdf2json-0.52-source/goo/GString.cc +724 -0
  165. data/pdf2json-0.52-source/goo/GString.cc.fixed +718 -0
  166. data/pdf2json-0.52-source/goo/GString.h +136 -0
  167. data/pdf2json-0.52-source/goo/GString.o +0 -0
  168. data/pdf2json-0.52-source/goo/ImgWriter.o +0 -0
  169. data/pdf2json-0.52-source/goo/JpegWriter.o +0 -0
  170. data/pdf2json-0.52-source/goo/Makefile +72 -0
  171. data/pdf2json-0.52-source/goo/Makefile.dep +0 -0
  172. data/pdf2json-0.52-source/goo/Makefile.in +72 -0
  173. data/pdf2json-0.52-source/goo/PNGWriter.o +0 -0
  174. data/pdf2json-0.52-source/goo/gfile.cc +731 -0
  175. data/pdf2json-0.52-source/goo/gfile.h +138 -0
  176. data/pdf2json-0.52-source/goo/gfile.o +0 -0
  177. data/pdf2json-0.52-source/goo/gmem.cc +264 -0
  178. data/pdf2json-0.52-source/goo/gmem.h +79 -0
  179. data/pdf2json-0.52-source/goo/gmem.o +0 -0
  180. data/pdf2json-0.52-source/goo/gmempp.cc +32 -0
  181. data/pdf2json-0.52-source/goo/gmempp.o +0 -0
  182. data/pdf2json-0.52-source/goo/gtypes.h +29 -0
  183. data/pdf2json-0.52-source/goo/libGoo.a +0 -0
  184. data/pdf2json-0.52-source/goo/parseargs.c +190 -0
  185. data/pdf2json-0.52-source/goo/parseargs.h +71 -0
  186. data/pdf2json-0.52-source/goo/parseargs.o +0 -0
  187. data/pdf2json-0.52-source/goo/vms_directory.c +214 -0
  188. data/pdf2json-0.52-source/goo/vms_dirent.h +67 -0
  189. data/pdf2json-0.52-source/goo/vms_make.com +82 -0
  190. data/pdf2json-0.52-source/goo/vms_sys_dirent.h +54 -0
  191. data/pdf2json-0.52-source/goo/vms_unix_time.h +102 -0
  192. data/pdf2json-0.52-source/goo/vms_unix_times.c +42 -0
  193. data/pdf2json-0.52-source/goo/vms_unlink.c +22 -0
  194. data/pdf2json-0.52-source/ms_make.bat +199 -0
  195. data/pdf2json-0.52-source/splash/.DS_Store +0 -0
  196. data/pdf2json-0.52-source/splash/Makefile +103 -0
  197. data/pdf2json-0.52-source/splash/Makefile.dep +0 -0
  198. data/pdf2json-0.52-source/splash/Makefile.in +103 -0
  199. data/pdf2json-0.52-source/splash/Splash.cc +3310 -0
  200. data/pdf2json-0.52-source/splash/Splash.h +293 -0
  201. data/pdf2json-0.52-source/splash/Splash.o +0 -0
  202. data/pdf2json-0.52-source/splash/SplashBitmap.cc +188 -0
  203. data/pdf2json-0.52-source/splash/SplashBitmap.h +64 -0
  204. data/pdf2json-0.52-source/splash/SplashBitmap.o +0 -0
  205. data/pdf2json-0.52-source/splash/SplashClip.cc +382 -0
  206. data/pdf2json-0.52-source/splash/SplashClip.h +107 -0
  207. data/pdf2json-0.52-source/splash/SplashClip.o +0 -0
  208. data/pdf2json-0.52-source/splash/SplashErrorCodes.h +32 -0
  209. data/pdf2json-0.52-source/splash/SplashFTFont.cc +357 -0
  210. data/pdf2json-0.52-source/splash/SplashFTFont.h +58 -0
  211. data/pdf2json-0.52-source/splash/SplashFTFont.o +0 -0
  212. data/pdf2json-0.52-source/splash/SplashFTFontEngine.cc +179 -0
  213. data/pdf2json-0.52-source/splash/SplashFTFontEngine.h +65 -0
  214. data/pdf2json-0.52-source/splash/SplashFTFontEngine.o +0 -0
  215. data/pdf2json-0.52-source/splash/SplashFTFontFile.cc +114 -0
  216. data/pdf2json-0.52-source/splash/SplashFTFontFile.h +73 -0
  217. data/pdf2json-0.52-source/splash/SplashFTFontFile.o +0 -0
  218. data/pdf2json-0.52-source/splash/SplashFont.cc +176 -0
  219. data/pdf2json-0.52-source/splash/SplashFont.h +104 -0
  220. data/pdf2json-0.52-source/splash/SplashFont.o +0 -0
  221. data/pdf2json-0.52-source/splash/SplashFontEngine.cc +317 -0
  222. data/pdf2json-0.52-source/splash/SplashFontEngine.h +91 -0
  223. data/pdf2json-0.52-source/splash/SplashFontEngine.o +0 -0
  224. data/pdf2json-0.52-source/splash/SplashFontFile.cc +55 -0
  225. data/pdf2json-0.52-source/splash/SplashFontFile.h +60 -0
  226. data/pdf2json-0.52-source/splash/SplashFontFile.o +0 -0
  227. data/pdf2json-0.52-source/splash/SplashFontFileID.cc +23 -0
  228. data/pdf2json-0.52-source/splash/SplashFontFileID.h +30 -0
  229. data/pdf2json-0.52-source/splash/SplashFontFileID.o +0 -0
  230. data/pdf2json-0.52-source/splash/SplashGlyphBitmap.h +26 -0
  231. data/pdf2json-0.52-source/splash/SplashMath.h +89 -0
  232. data/pdf2json-0.52-source/splash/SplashPath.cc +184 -0
  233. data/pdf2json-0.52-source/splash/SplashPath.h +121 -0
  234. data/pdf2json-0.52-source/splash/SplashPath.o +0 -0
  235. data/pdf2json-0.52-source/splash/SplashPattern.cc +40 -0
  236. data/pdf2json-0.52-source/splash/SplashPattern.h +65 -0
  237. data/pdf2json-0.52-source/splash/SplashPattern.o +0 -0
  238. data/pdf2json-0.52-source/splash/SplashScreen.cc +383 -0
  239. data/pdf2json-0.52-source/splash/SplashScreen.h +56 -0
  240. data/pdf2json-0.52-source/splash/SplashScreen.o +0 -0
  241. data/pdf2json-0.52-source/splash/SplashState.cc +165 -0
  242. data/pdf2json-0.52-source/splash/SplashState.h +103 -0
  243. data/pdf2json-0.52-source/splash/SplashState.o +0 -0
  244. data/pdf2json-0.52-source/splash/SplashT1Font.cc +287 -0
  245. data/pdf2json-0.52-source/splash/SplashT1Font.h +57 -0
  246. data/pdf2json-0.52-source/splash/SplashT1Font.o +0 -0
  247. data/pdf2json-0.52-source/splash/SplashT1FontEngine.cc +124 -0
  248. data/pdf2json-0.52-source/splash/SplashT1FontEngine.h +53 -0
  249. data/pdf2json-0.52-source/splash/SplashT1FontEngine.o +0 -0
  250. data/pdf2json-0.52-source/splash/SplashT1FontFile.cc +97 -0
  251. data/pdf2json-0.52-source/splash/SplashT1FontFile.h +58 -0
  252. data/pdf2json-0.52-source/splash/SplashT1FontFile.o +0 -0
  253. data/pdf2json-0.52-source/splash/SplashTypes.h +132 -0
  254. data/pdf2json-0.52-source/splash/SplashXPath.cc +438 -0
  255. data/pdf2json-0.52-source/splash/SplashXPath.h +100 -0
  256. data/pdf2json-0.52-source/splash/SplashXPath.o +0 -0
  257. data/pdf2json-0.52-source/splash/SplashXPathScanner.cc +428 -0
  258. data/pdf2json-0.52-source/splash/SplashXPathScanner.h +87 -0
  259. data/pdf2json-0.52-source/splash/SplashXPathScanner.o +0 -0
  260. data/pdf2json-0.52-source/splash/libsplash.a +0 -0
  261. data/pdf2json-0.52-source/splash/vms_make.com +0 -0
  262. data/pdf2json-0.52-source/src/.DS_Store +0 -0
  263. data/pdf2json-0.52-source/src/GVector.h +101 -0
  264. data/pdf2json-0.52-source/src/ImgOutputDev.cc +1243 -0
  265. data/pdf2json-0.52-source/src/ImgOutputDev.h +307 -0
  266. data/pdf2json-0.52-source/src/ImgOutputDev.o +0 -0
  267. data/pdf2json-0.52-source/src/Makefile +68 -0
  268. data/pdf2json-0.52-source/src/Makefile.in +68 -0
  269. data/pdf2json-0.52-source/src/XmlFonts.cc +367 -0
  270. data/pdf2json-0.52-source/src/XmlFonts.h +91 -0
  271. data/pdf2json-0.52-source/src/XmlFonts.o +0 -0
  272. data/pdf2json-0.52-source/src/XmlLinks.cc +101 -0
  273. data/pdf2json-0.52-source/src/XmlLinks.h +54 -0
  274. data/pdf2json-0.52-source/src/XmlLinks.o +0 -0
  275. data/pdf2json-0.52-source/src/pdf2json +0 -0
  276. data/pdf2json-0.52-source/src/pdf2json.cc +343 -0
  277. data/pdf2json-0.52-source/src/pdf2json.o +0 -0
  278. data/pdf2json-0.52-source/src/pdf2xml.dtd +22 -0
  279. data/pdf2json-0.52-source/src/pdf2xmljson.dtd +9 -0
  280. data/pdf2json-0.52-source/xpdf/.DS_Store +0 -0
  281. data/pdf2json-0.52-source/xpdf/Annot.cc +1556 -0
  282. data/pdf2json-0.52-source/xpdf/Annot.h +142 -0
  283. data/pdf2json-0.52-source/xpdf/Annot.o +0 -0
  284. data/pdf2json-0.52-source/xpdf/Array.cc +73 -0
  285. data/pdf2json-0.52-source/xpdf/Array.h +58 -0
  286. data/pdf2json-0.52-source/xpdf/Array.o +0 -0
  287. data/pdf2json-0.52-source/xpdf/BuiltinFont.cc +65 -0
  288. data/pdf2json-0.52-source/xpdf/BuiltinFont.h +57 -0
  289. data/pdf2json-0.52-source/xpdf/BuiltinFont.o +0 -0
  290. data/pdf2json-0.52-source/xpdf/BuiltinFontTables.cc +4284 -0
  291. data/pdf2json-0.52-source/xpdf/BuiltinFontTables.h +23 -0
  292. data/pdf2json-0.52-source/xpdf/BuiltinFontTables.o +0 -0
  293. data/pdf2json-0.52-source/xpdf/CMap.cc +408 -0
  294. data/pdf2json-0.52-source/xpdf/CMap.h +102 -0
  295. data/pdf2json-0.52-source/xpdf/CMap.o +0 -0
  296. data/pdf2json-0.52-source/xpdf/Catalog.cc +374 -0
  297. data/pdf2json-0.52-source/xpdf/Catalog.h +97 -0
  298. data/pdf2json-0.52-source/xpdf/Catalog.o +0 -0
  299. data/pdf2json-0.52-source/xpdf/CharCodeToUnicode.cc +540 -0
  300. data/pdf2json-0.52-source/xpdf/CharCodeToUnicode.h +117 -0
  301. data/pdf2json-0.52-source/xpdf/CharCodeToUnicode.o +0 -0
  302. data/pdf2json-0.52-source/xpdf/CharTypes.h +24 -0
  303. data/pdf2json-0.52-source/xpdf/CompactFontTables.h +464 -0
  304. data/pdf2json-0.52-source/xpdf/CoreOutputDev.cc +61 -0
  305. data/pdf2json-0.52-source/xpdf/CoreOutputDev.h +61 -0
  306. data/pdf2json-0.52-source/xpdf/Decrypt.cc +776 -0
  307. data/pdf2json-0.52-source/xpdf/Decrypt.h +95 -0
  308. data/pdf2json-0.52-source/xpdf/Decrypt.o +0 -0
  309. data/pdf2json-0.52-source/xpdf/Dict.cc +95 -0
  310. data/pdf2json-0.52-source/xpdf/Dict.h +77 -0
  311. data/pdf2json-0.52-source/xpdf/Dict.o +0 -0
  312. data/pdf2json-0.52-source/xpdf/Error.cc +38 -0
  313. data/pdf2json-0.52-source/xpdf/Error.h +23 -0
  314. data/pdf2json-0.52-source/xpdf/Error.o +0 -0
  315. data/pdf2json-0.52-source/xpdf/ErrorCodes.h +36 -0
  316. data/pdf2json-0.52-source/xpdf/FontEncodingTables.cc +1824 -0
  317. data/pdf2json-0.52-source/xpdf/FontEncodingTables.h +20 -0
  318. data/pdf2json-0.52-source/xpdf/FontEncodingTables.o +0 -0
  319. data/pdf2json-0.52-source/xpdf/Function.cc +1573 -0
  320. data/pdf2json-0.52-source/xpdf/Function.h +229 -0
  321. data/pdf2json-0.52-source/xpdf/Function.o +0 -0
  322. data/pdf2json-0.52-source/xpdf/Gfx.cc +4187 -0
  323. data/pdf2json-0.52-source/xpdf/Gfx.h +312 -0
  324. data/pdf2json-0.52-source/xpdf/Gfx.o +0 -0
  325. data/pdf2json-0.52-source/xpdf/GfxFont.cc +1568 -0
  326. data/pdf2json-0.52-source/xpdf/GfxFont.h +320 -0
  327. data/pdf2json-0.52-source/xpdf/GfxFont.o +0 -0
  328. data/pdf2json-0.52-source/xpdf/GfxState.cc +4137 -0
  329. data/pdf2json-0.52-source/xpdf/GfxState.h +1244 -0
  330. data/pdf2json-0.52-source/xpdf/GfxState.o +0 -0
  331. data/pdf2json-0.52-source/xpdf/GlobalParams.cc +2924 -0
  332. data/pdf2json-0.52-source/xpdf/GlobalParams.cc.old +2908 -0
  333. data/pdf2json-0.52-source/xpdf/GlobalParams.h +466 -0
  334. data/pdf2json-0.52-source/xpdf/GlobalParams.h.old +463 -0
  335. data/pdf2json-0.52-source/xpdf/GlobalParams.o +0 -0
  336. data/pdf2json-0.52-source/xpdf/ImageOutputDev.cc +195 -0
  337. data/pdf2json-0.52-source/xpdf/ImageOutputDev.h +76 -0
  338. data/pdf2json-0.52-source/xpdf/ImageOutputDev.o +0 -0
  339. data/pdf2json-0.52-source/xpdf/JArithmeticDecoder.cc +322 -0
  340. data/pdf2json-0.52-source/xpdf/JArithmeticDecoder.h +109 -0
  341. data/pdf2json-0.52-source/xpdf/JArithmeticDecoder.o +0 -0
  342. data/pdf2json-0.52-source/xpdf/JBIG2Stream.cc +3413 -0
  343. data/pdf2json-0.52-source/xpdf/JBIG2Stream.h +145 -0
  344. data/pdf2json-0.52-source/xpdf/JBIG2Stream.o +0 -0
  345. data/pdf2json-0.52-source/xpdf/JPXStream.cc +3144 -0
  346. data/pdf2json-0.52-source/xpdf/JPXStream.h +351 -0
  347. data/pdf2json-0.52-source/xpdf/JPXStream.o +0 -0
  348. data/pdf2json-0.52-source/xpdf/Lexer.cc +485 -0
  349. data/pdf2json-0.52-source/xpdf/Lexer.h +80 -0
  350. data/pdf2json-0.52-source/xpdf/Lexer.o +0 -0
  351. data/pdf2json-0.52-source/xpdf/Link.cc +806 -0
  352. data/pdf2json-0.52-source/xpdf/Link.cc.old +784 -0
  353. data/pdf2json-0.52-source/xpdf/Link.h +415 -0
  354. data/pdf2json-0.52-source/xpdf/Link.h.old +369 -0
  355. data/pdf2json-0.52-source/xpdf/Link.o +0 -0
  356. data/pdf2json-0.52-source/xpdf/Makefile +232 -0
  357. data/pdf2json-0.52-source/xpdf/Makefile.dep +0 -0
  358. data/pdf2json-0.52-source/xpdf/Makefile.in +232 -0
  359. data/pdf2json-0.52-source/xpdf/NameToCharCode.cc +116 -0
  360. data/pdf2json-0.52-source/xpdf/NameToCharCode.h +42 -0
  361. data/pdf2json-0.52-source/xpdf/NameToCharCode.o +0 -0
  362. data/pdf2json-0.52-source/xpdf/NameToUnicodeTable.h +1097 -0
  363. data/pdf2json-0.52-source/xpdf/Object.cc +231 -0
  364. data/pdf2json-0.52-source/xpdf/Object.h +303 -0
  365. data/pdf2json-0.52-source/xpdf/Object.o +0 -0
  366. data/pdf2json-0.52-source/xpdf/Outline.cc +151 -0
  367. data/pdf2json-0.52-source/xpdf/Outline.h +76 -0
  368. data/pdf2json-0.52-source/xpdf/Outline.o +0 -0
  369. data/pdf2json-0.52-source/xpdf/OutputDev.cc +131 -0
  370. data/pdf2json-0.52-source/xpdf/OutputDev.h +253 -0
  371. data/pdf2json-0.52-source/xpdf/OutputDev.o +0 -0
  372. data/pdf2json-0.52-source/xpdf/PDFCore.cc +2044 -0
  373. data/pdf2json-0.52-source/xpdf/PDFCore.h +321 -0
  374. data/pdf2json-0.52-source/xpdf/PDFDoc.cc +404 -0
  375. data/pdf2json-0.52-source/xpdf/PDFDoc.h +183 -0
  376. data/pdf2json-0.52-source/xpdf/PDFDoc.o +0 -0
  377. data/pdf2json-0.52-source/xpdf/PDFDocEncoding.cc +44 -0
  378. data/pdf2json-0.52-source/xpdf/PDFDocEncoding.h +16 -0
  379. data/pdf2json-0.52-source/xpdf/PDFDocEncoding.o +0 -0
  380. data/pdf2json-0.52-source/xpdf/PSOutputDev.cc +6224 -0
  381. data/pdf2json-0.52-source/xpdf/PSOutputDev.h +395 -0
  382. data/pdf2json-0.52-source/xpdf/PSOutputDev.o +0 -0
  383. data/pdf2json-0.52-source/xpdf/PSTokenizer.cc +135 -0
  384. data/pdf2json-0.52-source/xpdf/PSTokenizer.h +41 -0
  385. data/pdf2json-0.52-source/xpdf/PSTokenizer.o +0 -0
  386. data/pdf2json-0.52-source/xpdf/Page.cc +454 -0
  387. data/pdf2json-0.52-source/xpdf/Page.h +187 -0
  388. data/pdf2json-0.52-source/xpdf/Page.o +0 -0
  389. data/pdf2json-0.52-source/xpdf/Parser.cc +227 -0
  390. data/pdf2json-0.52-source/xpdf/Parser.h +59 -0
  391. data/pdf2json-0.52-source/xpdf/Parser.o +0 -0
  392. data/pdf2json-0.52-source/xpdf/PreScanOutputDev.cc +257 -0
  393. data/pdf2json-0.52-source/xpdf/PreScanOutputDev.h +130 -0
  394. data/pdf2json-0.52-source/xpdf/PreScanOutputDev.o +0 -0
  395. data/pdf2json-0.52-source/xpdf/SecurityHandler.cc +390 -0
  396. data/pdf2json-0.52-source/xpdf/SecurityHandler.h +160 -0
  397. data/pdf2json-0.52-source/xpdf/SecurityHandler.o +0 -0
  398. data/pdf2json-0.52-source/xpdf/SplashOutputDev.cc +2845 -0
  399. data/pdf2json-0.52-source/xpdf/SplashOutputDev.h +247 -0
  400. data/pdf2json-0.52-source/xpdf/SplashOutputDev.o +0 -0
  401. data/pdf2json-0.52-source/xpdf/Stream-CCITT.h +459 -0
  402. data/pdf2json-0.52-source/xpdf/Stream.cc +4627 -0
  403. data/pdf2json-0.52-source/xpdf/Stream.h +858 -0
  404. data/pdf2json-0.52-source/xpdf/Stream.o +0 -0
  405. data/pdf2json-0.52-source/xpdf/TextOutputDev.cc +4090 -0
  406. data/pdf2json-0.52-source/xpdf/TextOutputDev.h +661 -0
  407. data/pdf2json-0.52-source/xpdf/TextOutputDev.o +0 -0
  408. data/pdf2json-0.52-source/xpdf/UTF8.h +56 -0
  409. data/pdf2json-0.52-source/xpdf/UnicodeMap.cc +302 -0
  410. data/pdf2json-0.52-source/xpdf/UnicodeMap.cc.old +293 -0
  411. data/pdf2json-0.52-source/xpdf/UnicodeMap.h +135 -0
  412. data/pdf2json-0.52-source/xpdf/UnicodeMap.h.old +123 -0
  413. data/pdf2json-0.52-source/xpdf/UnicodeMap.o +0 -0
  414. data/pdf2json-0.52-source/xpdf/UnicodeMapTables.h +361 -0
  415. data/pdf2json-0.52-source/xpdf/UnicodeTypeTable.cc +949 -0
  416. data/pdf2json-0.52-source/xpdf/UnicodeTypeTable.h +20 -0
  417. data/pdf2json-0.52-source/xpdf/UnicodeTypeTable.o +0 -0
  418. data/pdf2json-0.52-source/xpdf/XPDFApp.cc +447 -0
  419. data/pdf2json-0.52-source/xpdf/XPDFApp.h +114 -0
  420. data/pdf2json-0.52-source/xpdf/XPDFCore.cc +1655 -0
  421. data/pdf2json-0.52-source/xpdf/XPDFCore.h +251 -0
  422. data/pdf2json-0.52-source/xpdf/XPDFTree.cc +931 -0
  423. data/pdf2json-0.52-source/xpdf/XPDFTree.h +45 -0
  424. data/pdf2json-0.52-source/xpdf/XPDFTreeP.h +87 -0
  425. data/pdf2json-0.52-source/xpdf/XPDFViewer.cc +3488 -0
  426. data/pdf2json-0.52-source/xpdf/XPDFViewer.h +352 -0
  427. data/pdf2json-0.52-source/xpdf/XRef.cc +896 -0
  428. data/pdf2json-0.52-source/xpdf/XRef.h +133 -0
  429. data/pdf2json-0.52-source/xpdf/XRef.o +0 -0
  430. data/pdf2json-0.52-source/xpdf/XpdfPluginAPI.cc +262 -0
  431. data/pdf2json-0.52-source/xpdf/XpdfPluginAPI.h +341 -0
  432. data/pdf2json-0.52-source/xpdf/XpdfPluginAPI.o +0 -0
  433. data/pdf2json-0.52-source/xpdf/about-text.h +48 -0
  434. data/pdf2json-0.52-source/xpdf/about.xbm +6 -0
  435. data/pdf2json-0.52-source/xpdf/backArrow.xbm +6 -0
  436. data/pdf2json-0.52-source/xpdf/backArrowDis.xbm +6 -0
  437. data/pdf2json-0.52-source/xpdf/config.h +112 -0
  438. data/pdf2json-0.52-source/xpdf/dblLeftArrow.xbm +6 -0
  439. data/pdf2json-0.52-source/xpdf/dblLeftArrowDis.xbm +6 -0
  440. data/pdf2json-0.52-source/xpdf/dblRightArrow.xbm +6 -0
  441. data/pdf2json-0.52-source/xpdf/dblRightArrowDis.xbm +6 -0
  442. data/pdf2json-0.52-source/xpdf/find.xbm +6 -0
  443. data/pdf2json-0.52-source/xpdf/findDis.xbm +6 -0
  444. data/pdf2json-0.52-source/xpdf/forwardArrow.xbm +6 -0
  445. data/pdf2json-0.52-source/xpdf/forwardArrowDis.xbm +6 -0
  446. data/pdf2json-0.52-source/xpdf/leftArrow.xbm +5 -0
  447. data/pdf2json-0.52-source/xpdf/leftArrowDis.xbm +5 -0
  448. data/pdf2json-0.52-source/xpdf/libXpdf.a +0 -0
  449. data/pdf2json-0.52-source/xpdf/pdffonts +0 -0
  450. data/pdf2json-0.52-source/xpdf/pdffonts.cc +298 -0
  451. data/pdf2json-0.52-source/xpdf/pdffonts.o +0 -0
  452. data/pdf2json-0.52-source/xpdf/pdfimages +0 -0
  453. data/pdf2json-0.52-source/xpdf/pdfimages.cc +155 -0
  454. data/pdf2json-0.52-source/xpdf/pdfimages.o +0 -0
  455. data/pdf2json-0.52-source/xpdf/pdfinfo +0 -0
  456. data/pdf2json-0.52-source/xpdf/pdfinfo.cc +387 -0
  457. data/pdf2json-0.52-source/xpdf/pdfinfo.o +0 -0
  458. data/pdf2json-0.52-source/xpdf/pdftoppm.cc +203 -0
  459. data/pdf2json-0.52-source/xpdf/pdftops +0 -0
  460. data/pdf2json-0.52-source/xpdf/pdftops.cc +344 -0
  461. data/pdf2json-0.52-source/xpdf/pdftops.o +0 -0
  462. data/pdf2json-0.52-source/xpdf/pdftotext +0 -0
  463. data/pdf2json-0.52-source/xpdf/pdftotext.cc +333 -0
  464. data/pdf2json-0.52-source/xpdf/pdftotext.o +0 -0
  465. data/pdf2json-0.52-source/xpdf/print.xbm +6 -0
  466. data/pdf2json-0.52-source/xpdf/printDis.xbm +6 -0
  467. data/pdf2json-0.52-source/xpdf/rightArrow.xbm +5 -0
  468. data/pdf2json-0.52-source/xpdf/rightArrowDis.xbm +5 -0
  469. data/pdf2json-0.52-source/xpdf/vms_make.com +129 -0
  470. data/pdf2json-0.52-source/xpdf/xpdf.cc +344 -0
  471. data/pdf2json-0.52-source/xpdf/xpdfIcon.xpm +62 -0
  472. data/pdf2json.gemspec +29 -0
  473. metadata +518 -0
Binary file
@@ -0,0 +1,4090 @@
1
+ //========================================================================
2
+ //
3
+ // TextOutputDev.cc
4
+ //
5
+ // Copyright 1997-2003 Glyph & Cog, LLC
6
+ //
7
+ //========================================================================
8
+
9
+ #include <aconf.h>
10
+
11
+ #ifdef USE_GCC_PRAGMAS
12
+ #pragma implementation
13
+ #endif
14
+
15
+ #include <stdio.h>
16
+ #include <stdlib.h>
17
+ #include <stddef.h>
18
+ #include <math.h>
19
+ #include <ctype.h>
20
+ #ifdef WIN32
21
+ #include <fcntl.h> // for O_BINARY
22
+ #include <io.h> // for setmode
23
+ #endif
24
+ #include "gmem.h"
25
+ #include "GString.h"
26
+ #include "GList.h"
27
+ #include "config.h"
28
+ #include "Error.h"
29
+ #include "GlobalParams.h"
30
+ #include "UnicodeMap.h"
31
+ #include "UnicodeTypeTable.h"
32
+ #include "GfxState.h"
33
+ #include "Link.h"
34
+ #include "TextOutputDev.h"
35
+
36
+ #ifdef MACOS
37
+ // needed for setting type/creator of MacOS files
38
+ #include "ICSupport.h"
39
+ #endif
40
+
41
+ //------------------------------------------------------------------------
42
+ // parameters
43
+ //------------------------------------------------------------------------
44
+
45
// Each bucket in a text pool includes baselines within a range of
// this many points.
#define textPoolStep 4

// Inter-character space width which will cause addChar to start a new
// word.
#define minWordBreakSpace 0.1

// Negative inter-character space width, i.e., overlap, which will
// cause addChar to start a new word.
#define minDupBreakOverlap 0.2

// Max distance between baselines of two lines within a block, as a
// fraction of the font size.
#define maxLineSpacingDelta 1.5

// Max difference in primary font sizes on two lines in the same
// block.  Delta1 is used when examining new lines above and below the
// current block; delta2 is used when examining text that overlaps the
// current block; delta3 is used when examining text to the left and
// right of the current block.
#define maxBlockFontSizeDelta1 0.05
#define maxBlockFontSizeDelta2 0.6
#define maxBlockFontSizeDelta3 0.2

// Max difference in font sizes inside a word.
#define maxWordFontSizeDelta 0.05

// Maximum distance between baselines of two words on the same line,
// e.g., distance between subscript or superscript and the primary
// baseline, as a fraction of the font size.
#define maxIntraLineDelta 0.5

// Minimum inter-word spacing, as a fraction of the font size.  (Only
// used for raw ordering.)
#define minWordSpacing 0.15

// Maximum inter-word spacing, as a fraction of the font size.
#define maxWordSpacing 1.5

// Maximum horizontal spacing which will allow a word to be pulled
// into a block.
#define minColSpacing1 0.3

// Minimum spacing between columns, as a fraction of the font size.
#define minColSpacing2 1.0

// Maximum vertical spacing between blocks within a flow, as a
// multiple of the font size.
#define maxBlockSpacing 2.5

// Minimum spacing between characters within a word, as a fraction of
// the font size.
// (Parenthesized so the negative literal cannot bind to a neighboring
// operator at the point of expansion.)
#define minCharSpacing (-0.2)

// Maximum spacing between characters within a word, as a fraction of
// the font size, when there is no obvious extra-wide character
// spacing.
#define maxCharSpacing 0.03

// When extra-wide character spacing is detected, the inter-character
// space threshold is set to the minimum inter-character space
// multiplied by this constant.
#define maxWideCharSpacingMul 1.3

// Upper limit on spacing between characters in a word.
#define maxWideCharSpacing 0.4

// Max difference in primary,secondary coordinates (as a fraction of
// the font size) allowed for duplicated text (fake boldface, drop
// shadows) which is to be discarded.
#define dupMaxPriDelta 0.1
#define dupMaxSecDelta 0.2

// Max width of underlines (in points).
#define maxUnderlineWidth 3

// Min distance between baseline and underline (in points).
//~ this should be font-size-dependent
// (Parenthesized for the same reason as minCharSpacing.)
#define minUnderlineGap (-2)

// Max distance between baseline and underline (in points).
//~ this should be font-size-dependent
#define maxUnderlineGap 4

// Max horizontal distance between edge of word and start of underline
// (in points).
//~ this should be font-size-dependent
#define underlineSlack 1

// Max distance between edge of text and edge of link border
#define hyperlinkSlack 2
137
+
138
+ //------------------------------------------------------------------------
139
+ // TextUnderline
140
+ //------------------------------------------------------------------------
141
+
142
+ class TextUnderline {
143
+ public:
144
+
145
+ TextUnderline(double x0A, double y0A, double x1A, double y1A)
146
+ { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
147
+ ~TextUnderline() {}
148
+
149
+ double x0, y0, x1, y1;
150
+ GBool horiz;
151
+ };
152
+
153
+ //------------------------------------------------------------------------
154
+ // TextLink
155
+ //------------------------------------------------------------------------
156
+
157
+ class TextLink {
158
+ public:
159
+
160
+ TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, Link *linkA)
161
+ { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; link = linkA; }
162
+ ~TextLink() {}
163
+
164
+ int xMin, yMin, xMax, yMax;
165
+ Link *link;
166
+ };
167
+
168
+ //------------------------------------------------------------------------
169
+ // TextFontInfo
170
+ //------------------------------------------------------------------------
171
+
172
+ TextFontInfo::TextFontInfo(GfxState *state) {
173
+ gfxFont = state->getFont();
174
+ #if TEXTOUT_WORD_LIST
175
+ fontName = (gfxFont && gfxFont->getOrigName())
176
+ ? gfxFont->getOrigName()->copy()
177
+ : (GString *)NULL;
178
+ flags = gfxFont ? gfxFont->getFlags() : 0;
179
+ #endif
180
+ }
181
+
182
+ TextFontInfo::~TextFontInfo() {
183
+ #if TEXTOUT_WORD_LIST
184
+ if (fontName) {
185
+ delete fontName;
186
+ }
187
+ #endif
188
+ }
189
+
190
+ GBool TextFontInfo::matches(GfxState *state) {
191
+ return state->getFont() == gfxFont;
192
+ }
193
+
194
+ //------------------------------------------------------------------------
195
+ // TextWord
196
+ //------------------------------------------------------------------------
197
+
198
+ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
199
+ int charPosA, TextFontInfo *fontA, double fontSizeA) {
200
+ GfxFont *gfxFont;
201
+ double x, y, ascent, descent;
202
+
203
+ rot = rotA;
204
+ charPos = charPosA;
205
+ charLen = 0;
206
+ font = fontA;
207
+ fontSize = fontSizeA;
208
+ state->transform(x0, y0, &x, &y);
209
+ if ((gfxFont = font->gfxFont)) {
210
+ ascent = gfxFont->getAscent() * fontSize;
211
+ descent = gfxFont->getDescent() * fontSize;
212
+ } else {
213
+ // this means that the PDF file draws text without a current font,
214
+ // which should never happen
215
+ ascent = 0.95 * fontSize;
216
+ descent = -0.35 * fontSize;
217
+ }
218
+ switch (rot) {
219
+ case 0:
220
+ yMin = y - ascent;
221
+ yMax = y - descent;
222
+ if (yMin == yMax) {
223
+ // this is a sanity check for a case that shouldn't happen -- but
224
+ // if it does happen, we want to avoid dividing by zero later
225
+ yMin = y;
226
+ yMax = y + 1;
227
+ }
228
+ base = y;
229
+ break;
230
+ case 1:
231
+ xMin = x + descent;
232
+ xMax = x + ascent;
233
+ if (xMin == xMax) {
234
+ // this is a sanity check for a case that shouldn't happen -- but
235
+ // if it does happen, we want to avoid dividing by zero later
236
+ xMin = x;
237
+ xMax = x + 1;
238
+ }
239
+ base = x;
240
+ break;
241
+ case 2:
242
+ yMin = y + descent;
243
+ yMax = y + ascent;
244
+ if (yMin == yMax) {
245
+ // this is a sanity check for a case that shouldn't happen -- but
246
+ // if it does happen, we want to avoid dividing by zero later
247
+ yMin = y;
248
+ yMax = y + 1;
249
+ }
250
+ base = y;
251
+ break;
252
+ case 3:
253
+ xMin = x - ascent;
254
+ xMax = x - descent;
255
+ if (xMin == xMax) {
256
+ // this is a sanity check for a case that shouldn't happen -- but
257
+ // if it does happen, we want to avoid dividing by zero later
258
+ xMin = x;
259
+ xMax = x + 1;
260
+ }
261
+ base = x;
262
+ break;
263
+ }
264
+ text = NULL;
265
+ edge = NULL;
266
+ len = size = 0;
267
+ spaceAfter = gFalse;
268
+ next = NULL;
269
+
270
+ #if TEXTOUT_WORD_LIST
271
+ GfxRGB rgb;
272
+
273
+ if ((state->getRender() & 3) == 1) {
274
+ state->getStrokeRGB(&rgb);
275
+ } else {
276
+ state->getFillRGB(&rgb);
277
+ }
278
+ colorR = colToDbl(rgb.r);
279
+ colorG = colToDbl(rgb.g);
280
+ colorB = colToDbl(rgb.b);
281
+ #endif
282
+
283
+ underlined = gFalse;
284
+ link = NULL;
285
+ }
286
+
287
+ TextWord::~TextWord() {
288
+ gfree(text);
289
+ gfree(edge);
290
+ }
291
+
292
+ void TextWord::addChar(GfxState *state, double x, double y,
293
+ double dx, double dy, Unicode u) {
294
+ if (len == size) {
295
+ size += 16;
296
+ text = (Unicode *)greallocn(text, size, sizeof(Unicode));
297
+ edge = (double *)greallocn(edge, size + 1, sizeof(double));
298
+ }
299
+ text[len] = u;
300
+ switch (rot) {
301
+ case 0:
302
+ if (len == 0) {
303
+ xMin = x;
304
+ }
305
+ edge[len] = x;
306
+ xMax = edge[len+1] = x + dx;
307
+ break;
308
+ case 1:
309
+ if (len == 0) {
310
+ yMin = y;
311
+ }
312
+ edge[len] = y;
313
+ yMax = edge[len+1] = y + dy;
314
+ break;
315
+ case 2:
316
+ if (len == 0) {
317
+ xMax = x;
318
+ }
319
+ edge[len] = x;
320
+ xMin = edge[len+1] = x + dx;
321
+ break;
322
+ case 3:
323
+ if (len == 0) {
324
+ yMax = y;
325
+ }
326
+ edge[len] = y;
327
+ yMin = edge[len+1] = y + dy;
328
+ break;
329
+ }
330
+ ++len;
331
+ }
332
+
333
+ void TextWord::merge(TextWord *word) {
334
+ int i;
335
+
336
+ if (word->xMin < xMin) {
337
+ xMin = word->xMin;
338
+ }
339
+ if (word->yMin < yMin) {
340
+ yMin = word->yMin;
341
+ }
342
+ if (word->xMax > xMax) {
343
+ xMax = word->xMax;
344
+ }
345
+ if (word->yMax > yMax) {
346
+ yMax = word->yMax;
347
+ }
348
+ if (len + word->len > size) {
349
+ size = len + word->len;
350
+ text = (Unicode *)greallocn(text, size, sizeof(Unicode));
351
+ edge = (double *)greallocn(edge, size + 1, sizeof(double));
352
+ }
353
+ for (i = 0; i < word->len; ++i) {
354
+ text[len + i] = word->text[i];
355
+ edge[len + i] = word->edge[i];
356
+ }
357
+ edge[len + word->len] = word->edge[word->len];
358
+ len += word->len;
359
+ charLen += word->charLen;
360
+ }
361
+
362
+ inline int TextWord::primaryCmp(TextWord *word) {
363
+ double cmp;
364
+
365
+ cmp = 0; // make gcc happy
366
+ switch (rot) {
367
+ case 0:
368
+ cmp = xMin - word->xMin;
369
+ break;
370
+ case 1:
371
+ cmp = yMin - word->yMin;
372
+ break;
373
+ case 2:
374
+ cmp = word->xMax - xMax;
375
+ break;
376
+ case 3:
377
+ cmp = word->yMax - yMax;
378
+ break;
379
+ }
380
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
381
+ }
382
+
383
+ double TextWord::primaryDelta(TextWord *word) {
384
+ double delta;
385
+
386
+ delta = 0; // make gcc happy
387
+ switch (rot) {
388
+ case 0:
389
+ delta = word->xMin - xMax;
390
+ break;
391
+ case 1:
392
+ delta = word->yMin - yMax;
393
+ break;
394
+ case 2:
395
+ delta = xMin - word->xMax;
396
+ break;
397
+ case 3:
398
+ delta = yMin - word->yMax;
399
+ break;
400
+ }
401
+ return delta;
402
+ }
403
+
404
+ int TextWord::cmpYX(const void *p1, const void *p2) {
405
+ TextWord *word1 = *(TextWord **)p1;
406
+ TextWord *word2 = *(TextWord **)p2;
407
+ double cmp;
408
+
409
+ cmp = word1->yMin - word2->yMin;
410
+ if (cmp == 0) {
411
+ cmp = word1->xMin - word2->xMin;
412
+ }
413
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
414
+ }
415
+
416
+ #if TEXTOUT_WORD_LIST
417
+
418
+ GString *TextWord::getText() {
419
+ GString *s;
420
+ UnicodeMap *uMap;
421
+ char buf[8];
422
+ int n, i;
423
+
424
+ s = new GString();
425
+ if (!(uMap = globalParams->getTextEncoding())) {
426
+ return s;
427
+ }
428
+ for (i = 0; i < len; ++i) {
429
+ n = uMap->mapUnicode(text[i], buf, sizeof(buf));
430
+ s->append(buf, n);
431
+ }
432
+ uMap->decRefCnt();
433
+ return s;
434
+ }
435
+
436
+ void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
437
+ double *xMaxA, double *yMaxA) {
438
+ if (charIdx < 0 || charIdx >= len) {
439
+ return;
440
+ }
441
+ switch (rot) {
442
+ case 0:
443
+ *xMinA = edge[charIdx];
444
+ *xMaxA = edge[charIdx + 1];
445
+ *yMinA = yMin;
446
+ *yMaxA = yMax;
447
+ break;
448
+ case 1:
449
+ *xMinA = xMin;
450
+ *xMaxA = xMax;
451
+ *yMinA = edge[charIdx];
452
+ *yMaxA = edge[charIdx + 1];
453
+ break;
454
+ case 2:
455
+ *xMinA = edge[charIdx + 1];
456
+ *xMaxA = edge[charIdx];
457
+ *yMinA = yMin;
458
+ *yMaxA = yMax;
459
+ break;
460
+ case 3:
461
+ *xMinA = xMin;
462
+ *xMaxA = xMax;
463
+ *yMinA = edge[charIdx + 1];
464
+ *yMaxA = edge[charIdx];
465
+ break;
466
+ }
467
+ }
468
+
469
+ #endif // TEXTOUT_WORD_LIST
470
+
471
+ //------------------------------------------------------------------------
472
+ // TextPool
473
+ //------------------------------------------------------------------------
474
+
475
+ TextPool::TextPool() {
476
+ minBaseIdx = 0;
477
+ maxBaseIdx = -1;
478
+ pool = NULL;
479
+ cursor = NULL;
480
+ cursorBaseIdx = -1;
481
+ }
482
+
483
+ TextPool::~TextPool() {
484
+ int baseIdx;
485
+ TextWord *word, *word2;
486
+
487
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
488
+ for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
489
+ word2 = word->next;
490
+ delete word;
491
+ }
492
+ }
493
+ gfree(pool);
494
+ }
495
+
496
+ int TextPool::getBaseIdx(double base) {
497
+ int baseIdx;
498
+
499
+ baseIdx = (int)(base / textPoolStep);
500
+ if (baseIdx < minBaseIdx) {
501
+ return minBaseIdx;
502
+ }
503
+ if (baseIdx > maxBaseIdx) {
504
+ return maxBaseIdx;
505
+ }
506
+ return baseIdx;
507
+ }
508
+
509
+ void TextPool::addWord(TextWord *word) {
510
+ TextWord **newPool;
511
+ int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
512
+ TextWord *w0, *w1;
513
+
514
+ // expand the array if needed
515
+ wordBaseIdx = (int)(word->base / textPoolStep);
516
+ if (minBaseIdx > maxBaseIdx) {
517
+ minBaseIdx = wordBaseIdx - 128;
518
+ maxBaseIdx = wordBaseIdx + 128;
519
+ pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
520
+ sizeof(TextWord *));
521
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
522
+ pool[baseIdx - minBaseIdx] = NULL;
523
+ }
524
+ } else if (wordBaseIdx < minBaseIdx) {
525
+ newMinBaseIdx = wordBaseIdx - 128;
526
+ newPool = (TextWord **)gmallocn(maxBaseIdx - newMinBaseIdx + 1,
527
+ sizeof(TextWord *));
528
+ for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
529
+ newPool[baseIdx - newMinBaseIdx] = NULL;
530
+ }
531
+ memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
532
+ (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
533
+ gfree(pool);
534
+ pool = newPool;
535
+ minBaseIdx = newMinBaseIdx;
536
+ } else if (wordBaseIdx > maxBaseIdx) {
537
+ newMaxBaseIdx = wordBaseIdx + 128;
538
+ pool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
539
+ sizeof(TextWord *));
540
+ for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
541
+ pool[baseIdx - minBaseIdx] = NULL;
542
+ }
543
+ maxBaseIdx = newMaxBaseIdx;
544
+ }
545
+
546
+ // insert the new word
547
+ if (cursor && wordBaseIdx == cursorBaseIdx &&
548
+ word->primaryCmp(cursor) > 0) {
549
+ w0 = cursor;
550
+ w1 = cursor->next;
551
+ } else {
552
+ w0 = NULL;
553
+ w1 = pool[wordBaseIdx - minBaseIdx];
554
+ }
555
+ for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
556
+ word->next = w1;
557
+ if (w0) {
558
+ w0->next = word;
559
+ } else {
560
+ pool[wordBaseIdx - minBaseIdx] = word;
561
+ }
562
+ cursor = word;
563
+ cursorBaseIdx = wordBaseIdx;
564
+ }
565
+
566
+ //------------------------------------------------------------------------
567
+ // TextLine
568
+ //------------------------------------------------------------------------
569
+
570
+ TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
571
+ blk = blkA;
572
+ rot = rotA;
573
+ xMin = yMin = 0;
574
+ xMax = yMax = -1;
575
+ base = baseA;
576
+ words = lastWord = NULL;
577
+ text = NULL;
578
+ edge = NULL;
579
+ col = NULL;
580
+ len = 0;
581
+ convertedLen = 0;
582
+ hyphenated = gFalse;
583
+ next = NULL;
584
+ }
585
+
586
+ TextLine::~TextLine() {
587
+ TextWord *word;
588
+
589
+ while (words) {
590
+ word = words;
591
+ words = words->next;
592
+ delete word;
593
+ }
594
+ gfree(text);
595
+ gfree(edge);
596
+ gfree(col);
597
+ }
598
+
599
+ void TextLine::addWord(TextWord *word) {
600
+ if (lastWord) {
601
+ lastWord->next = word;
602
+ } else {
603
+ words = word;
604
+ }
605
+ lastWord = word;
606
+
607
+ if (xMin > xMax) {
608
+ xMin = word->xMin;
609
+ xMax = word->xMax;
610
+ yMin = word->yMin;
611
+ yMax = word->yMax;
612
+ } else {
613
+ if (word->xMin < xMin) {
614
+ xMin = word->xMin;
615
+ }
616
+ if (word->xMax > xMax) {
617
+ xMax = word->xMax;
618
+ }
619
+ if (word->yMin < yMin) {
620
+ yMin = word->yMin;
621
+ }
622
+ if (word->yMax > yMax) {
623
+ yMax = word->yMax;
624
+ }
625
+ }
626
+ }
627
+
628
+ double TextLine::primaryDelta(TextLine *line) {
629
+ double delta;
630
+
631
+ delta = 0; // make gcc happy
632
+ switch (rot) {
633
+ case 0:
634
+ delta = line->xMin - xMax;
635
+ break;
636
+ case 1:
637
+ delta = line->yMin - yMax;
638
+ break;
639
+ case 2:
640
+ delta = xMin - line->xMax;
641
+ break;
642
+ case 3:
643
+ delta = yMin - line->yMax;
644
+ break;
645
+ }
646
+ return delta;
647
+ }
648
+
649
+ int TextLine::primaryCmp(TextLine *line) {
650
+ double cmp;
651
+
652
+ cmp = 0; // make gcc happy
653
+ switch (rot) {
654
+ case 0:
655
+ cmp = xMin - line->xMin;
656
+ break;
657
+ case 1:
658
+ cmp = yMin - line->yMin;
659
+ break;
660
+ case 2:
661
+ cmp = line->xMax - xMax;
662
+ break;
663
+ case 3:
664
+ cmp = line->yMax - yMax;
665
+ break;
666
+ }
667
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
668
+ }
669
+
670
+ int TextLine::secondaryCmp(TextLine *line) {
671
+ double cmp;
672
+
673
+ cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
674
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
675
+ }
676
+
677
+ int TextLine::cmpYX(TextLine *line) {
678
+ int cmp;
679
+
680
+ if ((cmp = secondaryCmp(line))) {
681
+ return cmp;
682
+ }
683
+ return primaryCmp(line);
684
+ }
685
+
686
+ int TextLine::cmpXY(const void *p1, const void *p2) {
687
+ TextLine *line1 = *(TextLine **)p1;
688
+ TextLine *line2 = *(TextLine **)p2;
689
+ int cmp;
690
+
691
+ if ((cmp = line1->primaryCmp(line2))) {
692
+ return cmp;
693
+ }
694
+ return line1->secondaryCmp(line2);
695
+ }
696
+
697
+ void TextLine::coalesce(UnicodeMap *uMap) {
698
+ TextWord *word0, *word1;
699
+ double space, delta, minSpace;
700
+ GBool isUnicode;
701
+ char buf[8];
702
+ int i, j;
703
+
704
+ if (words->next) {
705
+
706
+ // compute the inter-word space threshold
707
+ if (words->len > 1 || words->next->len > 1) {
708
+ minSpace = 0;
709
+ } else {
710
+ minSpace = words->primaryDelta(words->next);
711
+ for (word0 = words->next, word1 = word0->next;
712
+ word1 && minSpace > 0;
713
+ word0 = word1, word1 = word0->next) {
714
+ if (word1->len > 1) {
715
+ minSpace = 0;
716
+ }
717
+ delta = word0->primaryDelta(word1);
718
+ if (delta < minSpace) {
719
+ minSpace = delta;
720
+ }
721
+ }
722
+ }
723
+ if (minSpace <= 0) {
724
+ space = maxCharSpacing * words->fontSize;
725
+ } else {
726
+ space = maxWideCharSpacingMul * minSpace;
727
+ if (space > maxWideCharSpacing * words->fontSize) {
728
+ space = maxWideCharSpacing * words->fontSize;
729
+ }
730
+ }
731
+
732
+ // merge words
733
+ word0 = words;
734
+ word1 = words->next;
735
+ while (word1) {
736
+ if (word0->primaryDelta(word1) >= space) {
737
+ word0->spaceAfter = gTrue;
738
+ word0 = word1;
739
+ word1 = word1->next;
740
+ } else if (word0->font == word1->font &&
741
+ word0->underlined == word1->underlined &&
742
+ fabs(word0->fontSize - word1->fontSize) <
743
+ maxWordFontSizeDelta * words->fontSize &&
744
+ word1->charPos == word0->charPos + word0->charLen) {
745
+ word0->merge(word1);
746
+ word0->next = word1->next;
747
+ delete word1;
748
+ word1 = word0->next;
749
+ } else {
750
+ word0 = word1;
751
+ word1 = word1->next;
752
+ }
753
+ }
754
+ }
755
+
756
+ // build the line text
757
+ isUnicode = uMap ? uMap->isUnicode() : gFalse;
758
+ len = 0;
759
+ for (word1 = words; word1; word1 = word1->next) {
760
+ len += word1->len;
761
+ if (word1->spaceAfter) {
762
+ ++len;
763
+ }
764
+ }
765
+ text = (Unicode *)gmallocn(len, sizeof(Unicode));
766
+ edge = (double *)gmallocn(len + 1, sizeof(double));
767
+ i = 0;
768
+ for (word1 = words; word1; word1 = word1->next) {
769
+ for (j = 0; j < word1->len; ++j) {
770
+ text[i] = word1->text[j];
771
+ edge[i] = word1->edge[j];
772
+ ++i;
773
+ }
774
+ edge[i] = word1->edge[word1->len];
775
+ if (word1->spaceAfter) {
776
+ text[i] = (Unicode)0x0020;
777
+ ++i;
778
+ }
779
+ }
780
+
781
+ // compute convertedLen and set up the col array
782
+ col = (int *)gmallocn(len + 1, sizeof(int));
783
+ convertedLen = 0;
784
+ for (i = 0; i < len; ++i) {
785
+ col[i] = convertedLen;
786
+ if (isUnicode) {
787
+ ++convertedLen;
788
+ } else if (uMap) {
789
+ convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
790
+ }
791
+ }
792
+ col[len] = convertedLen;
793
+
794
+ // check for hyphen at end of line
795
+ //~ need to check for other chars used as hyphens
796
+ hyphenated = text[len - 1] == (Unicode)'-';
797
+ }
798
+
799
+ //------------------------------------------------------------------------
800
+ // TextLineFrag
801
+ //------------------------------------------------------------------------
802
+
803
+ class TextLineFrag {
804
+ public:
805
+
806
+ TextLine *line; // the line object
807
+ int start, len; // offset and length of this fragment
808
+ // (in Unicode chars)
809
+ double xMin, xMax; // bounding box coordinates
810
+ double yMin, yMax;
811
+ double base; // baseline virtual coordinate
812
+ int col; // first column
813
+
814
+ void init(TextLine *lineA, int startA, int lenA);
815
+ void computeCoords(GBool oneRot);
816
+
817
+ static int cmpYXPrimaryRot(const void *p1, const void *p2);
818
+ static int cmpYXLineRot(const void *p1, const void *p2);
819
+ static int cmpXYLineRot(const void *p1, const void *p2);
820
+ static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
821
+ static int cmpXYColumnLineRot(const void *p1, const void *p2);
822
+ };
823
+
824
+ void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
825
+ line = lineA;
826
+ start = startA;
827
+ len = lenA;
828
+ col = line->col[start];
829
+ }
830
+
831
+ void TextLineFrag::computeCoords(GBool oneRot) {
832
+ TextBlock *blk;
833
+ double d0, d1, d2, d3, d4;
834
+
835
+ if (oneRot) {
836
+
837
+ switch (line->rot) {
838
+ case 0:
839
+ xMin = line->edge[start];
840
+ xMax = line->edge[start + len];
841
+ yMin = line->yMin;
842
+ yMax = line->yMax;
843
+ break;
844
+ case 1:
845
+ xMin = line->xMin;
846
+ xMax = line->xMax;
847
+ yMin = line->edge[start];
848
+ yMax = line->edge[start + len];
849
+ break;
850
+ case 2:
851
+ xMin = line->edge[start + len];
852
+ xMax = line->edge[start];
853
+ yMin = line->yMin;
854
+ yMax = line->yMax;
855
+ break;
856
+ case 3:
857
+ xMin = line->xMin;
858
+ xMax = line->xMax;
859
+ yMin = line->edge[start + len];
860
+ yMax = line->edge[start];
861
+ break;
862
+ }
863
+ base = line->base;
864
+
865
+ } else {
866
+
867
+ if (line->rot == 0 && line->blk->page->primaryRot == 0) {
868
+
869
+ xMin = line->edge[start];
870
+ xMax = line->edge[start + len];
871
+ yMin = line->yMin;
872
+ yMax = line->yMax;
873
+ base = line->base;
874
+
875
+ } else {
876
+
877
+ blk = line->blk;
878
+ d0 = line->edge[start];
879
+ d1 = line->edge[start + len];
880
+ d2 = d3 = d4 = 0; // make gcc happy
881
+
882
+ switch (line->rot) {
883
+ case 0:
884
+ d2 = line->yMin;
885
+ d3 = line->yMax;
886
+ d4 = line->base;
887
+ d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
888
+ d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
889
+ d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
890
+ d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
891
+ d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
892
+ break;
893
+ case 1:
894
+ d2 = line->xMax;
895
+ d3 = line->xMin;
896
+ d4 = line->base;
897
+ d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
898
+ d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
899
+ d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
900
+ d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
901
+ d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
902
+ break;
903
+ case 2:
904
+ d2 = line->yMax;
905
+ d3 = line->yMin;
906
+ d4 = line->base;
907
+ d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
908
+ d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
909
+ d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
910
+ d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
911
+ d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
912
+ break;
913
+ case 3:
914
+ d2 = line->xMin;
915
+ d3 = line->xMax;
916
+ d4 = line->base;
917
+ d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
918
+ d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
919
+ d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
920
+ d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
921
+ d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
922
+ break;
923
+ }
924
+
925
+ switch (line->blk->page->primaryRot) {
926
+ case 0:
927
+ xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
928
+ xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
929
+ yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
930
+ yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
931
+ base = blk->yMin + base * (blk->yMax - blk->yMin);
932
+ break;
933
+ case 1:
934
+ xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
935
+ xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
936
+ yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
937
+ yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
938
+ base = blk->xMax - d4 * (blk->xMax - blk->xMin);
939
+ break;
940
+ case 2:
941
+ xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
942
+ xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
943
+ yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
944
+ yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
945
+ base = blk->yMax - d4 * (blk->yMax - blk->yMin);
946
+ break;
947
+ case 3:
948
+ xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
949
+ xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
950
+ yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
951
+ yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
952
+ base = blk->xMin + d4 * (blk->xMax - blk->xMin);
953
+ break;
954
+ }
955
+
956
+ }
957
+ }
958
+ }
959
+
960
+ int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
961
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
962
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
963
+ double cmp;
964
+
965
+ cmp = 0; // make gcc happy
966
+ switch (frag1->line->blk->page->primaryRot) {
967
+ case 0:
968
+ if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
969
+ cmp = frag1->xMin - frag2->xMin;
970
+ }
971
+ break;
972
+ case 1:
973
+ if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
974
+ cmp = frag1->yMin - frag2->yMin;
975
+ }
976
+ break;
977
+ case 2:
978
+ if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
979
+ cmp = frag2->xMax - frag1->xMax;
980
+ }
981
+ break;
982
+ case 3:
983
+ if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
984
+ cmp = frag2->yMax - frag1->yMax;
985
+ }
986
+ break;
987
+ }
988
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
989
+ }
990
+
991
+ int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
992
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
993
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
994
+ double cmp;
995
+
996
+ cmp = 0; // make gcc happy
997
+ switch (frag1->line->rot) {
998
+ case 0:
999
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1000
+ cmp = frag1->xMin - frag2->xMin;
1001
+ }
1002
+ break;
1003
+ case 1:
1004
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1005
+ cmp = frag1->yMin - frag2->yMin;
1006
+ }
1007
+ break;
1008
+ case 2:
1009
+ if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1010
+ cmp = frag2->xMax - frag1->xMax;
1011
+ }
1012
+ break;
1013
+ case 3:
1014
+ if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1015
+ cmp = frag2->yMax - frag1->yMax;
1016
+ }
1017
+ break;
1018
+ }
1019
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1020
+ }
1021
+
1022
+ int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
1023
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
1024
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
1025
+ double cmp;
1026
+
1027
+ cmp = 0; // make gcc happy
1028
+ switch (frag1->line->rot) {
1029
+ case 0:
1030
+ if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1031
+ cmp = frag1->yMin - frag2->yMin;
1032
+ }
1033
+ break;
1034
+ case 1:
1035
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1036
+ cmp = frag2->xMax - frag1->xMax;
1037
+ }
1038
+ break;
1039
+ case 2:
1040
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1041
+ cmp = frag2->yMin - frag1->yMin;
1042
+ }
1043
+ break;
1044
+ case 3:
1045
+ if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1046
+ cmp = frag1->xMax - frag2->xMax;
1047
+ }
1048
+ break;
1049
+ }
1050
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1051
+ }
1052
+
1053
+ int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) {
1054
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
1055
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
1056
+ double cmp;
1057
+
1058
+ // if columns overlap, compare y values
1059
+ if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1060
+ frag2->line->col[frag2->start]) &&
1061
+ frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1062
+ frag1->line->col[frag1->start])) {
1063
+ cmp = 0; // make gcc happy
1064
+ switch (frag1->line->blk->page->primaryRot) {
1065
+ case 0: cmp = frag1->yMin - frag2->yMin; break;
1066
+ case 1: cmp = frag2->xMax - frag1->xMax; break;
1067
+ case 2: cmp = frag2->yMin - frag1->yMin; break;
1068
+ case 3: cmp = frag1->xMax - frag2->xMax; break;
1069
+ }
1070
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1071
+ }
1072
+
1073
+ // otherwise, compare starting column
1074
+ return frag1->col - frag2->col;
1075
+ }
1076
+
1077
+ int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) {
1078
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
1079
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
1080
+ double cmp;
1081
+
1082
+ // if columns overlap, compare y values
1083
+ if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1084
+ frag2->line->col[frag2->start]) &&
1085
+ frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1086
+ frag1->line->col[frag1->start])) {
1087
+ cmp = 0; // make gcc happy
1088
+ switch (frag1->line->rot) {
1089
+ case 0: cmp = frag1->yMin - frag2->yMin; break;
1090
+ case 1: cmp = frag2->xMax - frag1->xMax; break;
1091
+ case 2: cmp = frag2->yMin - frag1->yMin; break;
1092
+ case 3: cmp = frag1->xMax - frag2->xMax; break;
1093
+ }
1094
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1095
+ }
1096
+
1097
+ // otherwise, compare starting column
1098
+ return frag1->col - frag2->col;
1099
+ }
1100
+
1101
+ //------------------------------------------------------------------------
1102
+ // TextBlock
1103
+ //------------------------------------------------------------------------
1104
+
1105
+ TextBlock::TextBlock(TextPage *pageA, int rotA) {
1106
+ page = pageA;
1107
+ rot = rotA;
1108
+ xMin = yMin = 0;
1109
+ xMax = yMax = -1;
1110
+ priMin = 0;
1111
+ priMax = page->pageWidth;
1112
+ pool = new TextPool();
1113
+ lines = NULL;
1114
+ curLine = NULL;
1115
+ next = NULL;
1116
+ stackNext = NULL;
1117
+ }
1118
+
1119
+ TextBlock::~TextBlock() {
1120
+ TextLine *line;
1121
+
1122
+ delete pool;
1123
+ while (lines) {
1124
+ line = lines;
1125
+ lines = lines->next;
1126
+ delete line;
1127
+ }
1128
+ }
1129
+
1130
+ void TextBlock::addWord(TextWord *word) {
1131
+ pool->addWord(word);
1132
+ if (xMin > xMax) {
1133
+ xMin = word->xMin;
1134
+ xMax = word->xMax;
1135
+ yMin = word->yMin;
1136
+ yMax = word->yMax;
1137
+ } else {
1138
+ if (word->xMin < xMin) {
1139
+ xMin = word->xMin;
1140
+ }
1141
+ if (word->xMax > xMax) {
1142
+ xMax = word->xMax;
1143
+ }
1144
+ if (word->yMin < yMin) {
1145
+ yMin = word->yMin;
1146
+ }
1147
+ if (word->yMax > yMax) {
1148
+ yMax = word->yMax;
1149
+ }
1150
+ }
1151
+ }
1152
+
1153
+ void TextBlock::coalesce(UnicodeMap *uMap) {
1154
+ TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1155
+ TextLine *line, *line0, *line1;
1156
+ int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1157
+ int baseIdx, bestWordBaseIdx, idx0, idx1;
1158
+ double minBase, maxBase;
1159
+ double fontSize, delta, priDelta, secDelta;
1160
+ TextLine **lineArray;
1161
+ GBool found;
1162
+ int col1, col2;
1163
+ int i, j, k;
1164
+
1165
+ // discard duplicated text (fake boldface, drop shadows)
1166
+ for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1167
+ word0 = pool->getPool(idx0);
1168
+ while (word0) {
1169
+ priDelta = dupMaxPriDelta * word0->fontSize;
1170
+ secDelta = dupMaxSecDelta * word0->fontSize;
1171
+ if (rot == 0 || rot == 3) {
1172
+ maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1173
+ } else {
1174
+ maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1175
+ }
1176
+ found = gFalse;
1177
+ word1 = word2 = NULL; // make gcc happy
1178
+ for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1179
+ if (idx1 == idx0) {
1180
+ word1 = word0;
1181
+ word2 = word0->next;
1182
+ } else {
1183
+ word1 = NULL;
1184
+ word2 = pool->getPool(idx1);
1185
+ }
1186
+ for (; word2; word1 = word2, word2 = word2->next) {
1187
+ if (word2->len == word0->len &&
1188
+ !memcmp(word2->text, word0->text,
1189
+ word0->len * sizeof(Unicode))) {
1190
+ switch (rot) {
1191
+ case 0:
1192
+ case 2:
1193
+ found = fabs(word0->xMin - word2->xMin) < priDelta &&
1194
+ fabs(word0->xMax - word2->xMax) < priDelta &&
1195
+ fabs(word0->yMin - word2->yMin) < secDelta &&
1196
+ fabs(word0->yMax - word2->yMax) < secDelta;
1197
+ break;
1198
+ case 1:
1199
+ case 3:
1200
+ found = fabs(word0->xMin - word2->xMin) < secDelta &&
1201
+ fabs(word0->xMax - word2->xMax) < secDelta &&
1202
+ fabs(word0->yMin - word2->yMin) < priDelta &&
1203
+ fabs(word0->yMax - word2->yMax) < priDelta;
1204
+ break;
1205
+ }
1206
+ }
1207
+ if (found) {
1208
+ break;
1209
+ }
1210
+ }
1211
+ if (found) {
1212
+ break;
1213
+ }
1214
+ }
1215
+ if (found) {
1216
+ if (word1) {
1217
+ word1->next = word2->next;
1218
+ } else {
1219
+ pool->setPool(idx1, word2->next);
1220
+ }
1221
+ delete word2;
1222
+ } else {
1223
+ word0 = word0->next;
1224
+ }
1225
+ }
1226
+ }
1227
+
1228
+ // build the lines
1229
+ curLine = NULL;
1230
+ poolMinBaseIdx = pool->minBaseIdx;
1231
+ charCount = 0;
1232
+ nLines = 0;
1233
+ while (1) {
1234
+
1235
+ // find the first non-empty line in the pool
1236
+ for (;
1237
+ poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1238
+ ++poolMinBaseIdx) ;
1239
+ if (poolMinBaseIdx > pool->maxBaseIdx) {
1240
+ break;
1241
+ }
1242
+
1243
+ // look for the left-most word in the first four lines of the
1244
+ // pool -- this avoids starting with a superscript word
1245
+ startBaseIdx = poolMinBaseIdx;
1246
+ for (baseIdx = poolMinBaseIdx + 1;
1247
+ baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1248
+ ++baseIdx) {
1249
+ if (!pool->getPool(baseIdx)) {
1250
+ continue;
1251
+ }
1252
+ if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1253
+ < 0) {
1254
+ startBaseIdx = baseIdx;
1255
+ }
1256
+ }
1257
+
1258
+ // create a new line
1259
+ word0 = pool->getPool(startBaseIdx);
1260
+ pool->setPool(startBaseIdx, word0->next);
1261
+ word0->next = NULL;
1262
+ line = new TextLine(this, word0->rot, word0->base);
1263
+ line->addWord(word0);
1264
+ lastWord = word0;
1265
+
1266
+ // compute the search range
1267
+ fontSize = word0->fontSize;
1268
+ minBase = word0->base - maxIntraLineDelta * fontSize;
1269
+ maxBase = word0->base + maxIntraLineDelta * fontSize;
1270
+ minBaseIdx = pool->getBaseIdx(minBase);
1271
+ maxBaseIdx = pool->getBaseIdx(maxBase);
1272
+
1273
+ // find the rest of the words in this line
1274
+ while (1) {
1275
+
1276
+ // find the left-most word whose baseline is in the range for
1277
+ // this line
1278
+ bestWordBaseIdx = 0;
1279
+ bestWord0 = bestWord1 = NULL;
1280
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1281
+ for (word0 = NULL, word1 = pool->getPool(baseIdx);
1282
+ word1;
1283
+ word0 = word1, word1 = word1->next) {
1284
+ if (word1->base >= minBase &&
1285
+ word1->base <= maxBase &&
1286
+ (delta = lastWord->primaryDelta(word1)) >=
1287
+ minCharSpacing * fontSize) {
1288
+ if (delta < maxWordSpacing * fontSize &&
1289
+ (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1290
+ bestWordBaseIdx = baseIdx;
1291
+ bestWord0 = word0;
1292
+ bestWord1 = word1;
1293
+ }
1294
+ break;
1295
+ }
1296
+ }
1297
+ }
1298
+ if (!bestWord1) {
1299
+ break;
1300
+ }
1301
+
1302
+ // remove it from the pool, and add it to the line
1303
+ if (bestWord0) {
1304
+ bestWord0->next = bestWord1->next;
1305
+ } else {
1306
+ pool->setPool(bestWordBaseIdx, bestWord1->next);
1307
+ }
1308
+ bestWord1->next = NULL;
1309
+ line->addWord(bestWord1);
1310
+ lastWord = bestWord1;
1311
+ }
1312
+
1313
+ // add the line
1314
+ if (curLine && line->cmpYX(curLine) > 0) {
1315
+ line0 = curLine;
1316
+ line1 = curLine->next;
1317
+ } else {
1318
+ line0 = NULL;
1319
+ line1 = lines;
1320
+ }
1321
+ for (;
1322
+ line1 && line->cmpYX(line1) > 0;
1323
+ line0 = line1, line1 = line1->next) ;
1324
+ if (line0) {
1325
+ line0->next = line;
1326
+ } else {
1327
+ lines = line;
1328
+ }
1329
+ line->next = line1;
1330
+ curLine = line;
1331
+ line->coalesce(uMap);
1332
+ charCount += line->len;
1333
+ ++nLines;
1334
+ }
1335
+
1336
+ // sort lines into xy order for column assignment
1337
+ lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1338
+ for (line = lines, i = 0; line; line = line->next, ++i) {
1339
+ lineArray[i] = line;
1340
+ }
1341
+ qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1342
+
1343
+ // column assignment
1344
+ nColumns = 0;
1345
+ for (i = 0; i < nLines; ++i) {
1346
+ line0 = lineArray[i];
1347
+ col1 = 0;
1348
+ for (j = 0; j < i; ++j) {
1349
+ line1 = lineArray[j];
1350
+ if (line1->primaryDelta(line0) >= 0) {
1351
+ col2 = line1->col[line1->len] + 1;
1352
+ } else {
1353
+ k = 0; // make gcc happy
1354
+ switch (rot) {
1355
+ case 0:
1356
+ for (k = 0;
1357
+ k < line1->len &&
1358
+ line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1359
+ ++k) ;
1360
+ break;
1361
+ case 1:
1362
+ for (k = 0;
1363
+ k < line1->len &&
1364
+ line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1365
+ ++k) ;
1366
+ break;
1367
+ case 2:
1368
+ for (k = 0;
1369
+ k < line1->len &&
1370
+ line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1371
+ ++k) ;
1372
+ break;
1373
+ case 3:
1374
+ for (k = 0;
1375
+ k < line1->len &&
1376
+ line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1377
+ ++k) ;
1378
+ break;
1379
+ }
1380
+ col2 = line1->col[k];
1381
+ }
1382
+ if (col2 > col1) {
1383
+ col1 = col2;
1384
+ }
1385
+ }
1386
+ for (k = 0; k <= line0->len; ++k) {
1387
+ line0->col[k] += col1;
1388
+ }
1389
+ if (line0->col[line0->len] > nColumns) {
1390
+ nColumns = line0->col[line0->len];
1391
+ }
1392
+ }
1393
+ gfree(lineArray);
1394
+ }
1395
+
1396
+ void TextBlock::updatePriMinMax(TextBlock *blk) {
1397
+ double newPriMin, newPriMax;
1398
+ GBool gotPriMin, gotPriMax;
1399
+
1400
+ gotPriMin = gotPriMax = gFalse;
1401
+ newPriMin = newPriMax = 0; // make gcc happy
1402
+ switch (page->primaryRot) {
1403
+ case 0:
1404
+ case 2:
1405
+ if (blk->yMin < yMax && blk->yMax > yMin) {
1406
+ if (blk->xMin < xMin) {
1407
+ newPriMin = blk->xMax;
1408
+ gotPriMin = gTrue;
1409
+ }
1410
+ if (blk->xMax > xMax) {
1411
+ newPriMax = blk->xMin;
1412
+ gotPriMax = gTrue;
1413
+ }
1414
+ }
1415
+ break;
1416
+ case 1:
1417
+ case 3:
1418
+ if (blk->xMin < xMax && blk->xMax > xMin) {
1419
+ if (blk->yMin < yMin) {
1420
+ newPriMin = blk->yMax;
1421
+ gotPriMin = gTrue;
1422
+ }
1423
+ if (blk->yMax > yMax) {
1424
+ newPriMax = blk->yMin;
1425
+ gotPriMax = gTrue;
1426
+ }
1427
+ }
1428
+ break;
1429
+ }
1430
+ if (gotPriMin) {
1431
+ if (newPriMin > xMin) {
1432
+ newPriMin = xMin;
1433
+ }
1434
+ if (newPriMin > priMin) {
1435
+ priMin = newPriMin;
1436
+ }
1437
+ }
1438
+ if (gotPriMax) {
1439
+ if (newPriMax < xMax) {
1440
+ newPriMax = xMax;
1441
+ }
1442
+ if (newPriMax < priMax) {
1443
+ priMax = newPriMax;
1444
+ }
1445
+ }
1446
+ }
1447
+
1448
+ int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1449
+ TextBlock *blk1 = *(TextBlock **)p1;
1450
+ TextBlock *blk2 = *(TextBlock **)p2;
1451
+ double cmp;
1452
+
1453
+ cmp = 0; // make gcc happy
1454
+ switch (blk1->page->primaryRot) {
1455
+ case 0:
1456
+ if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1457
+ cmp = blk1->yMin - blk2->yMin;
1458
+ }
1459
+ break;
1460
+ case 1:
1461
+ if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1462
+ cmp = blk2->xMax - blk1->xMax;
1463
+ }
1464
+ break;
1465
+ case 2:
1466
+ if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1467
+ cmp = blk2->yMin - blk1->yMin;
1468
+ }
1469
+ break;
1470
+ case 3:
1471
+ if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1472
+ cmp = blk1->xMax - blk2->xMax;
1473
+ }
1474
+ break;
1475
+ }
1476
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1477
+ }
1478
+
1479
+ int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1480
+ TextBlock *blk1 = *(TextBlock **)p1;
1481
+ TextBlock *blk2 = *(TextBlock **)p2;
1482
+ double cmp;
1483
+
1484
+ cmp = 0; // make gcc happy
1485
+ switch (blk1->page->primaryRot) {
1486
+ case 0:
1487
+ if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1488
+ cmp = blk1->xMin - blk2->xMin;
1489
+ }
1490
+ break;
1491
+ case 1:
1492
+ if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1493
+ cmp = blk1->yMin - blk2->yMin;
1494
+ }
1495
+ break;
1496
+ case 2:
1497
+ if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1498
+ cmp = blk2->xMax - blk1->xMax;
1499
+ }
1500
+ break;
1501
+ case 3:
1502
+ if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1503
+ cmp = blk2->yMax - blk1->yMax;
1504
+ }
1505
+ break;
1506
+ }
1507
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1508
+ }
1509
+
1510
+ int TextBlock::primaryCmp(TextBlock *blk) {
1511
+ double cmp;
1512
+
1513
+ cmp = 0; // make gcc happy
1514
+ switch (rot) {
1515
+ case 0:
1516
+ cmp = xMin - blk->xMin;
1517
+ break;
1518
+ case 1:
1519
+ cmp = yMin - blk->yMin;
1520
+ break;
1521
+ case 2:
1522
+ cmp = blk->xMax - xMax;
1523
+ break;
1524
+ case 3:
1525
+ cmp = blk->yMax - yMax;
1526
+ break;
1527
+ }
1528
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1529
+ }
1530
+
1531
+ double TextBlock::secondaryDelta(TextBlock *blk) {
1532
+ double delta;
1533
+
1534
+ delta = 0; // make gcc happy
1535
+ switch (rot) {
1536
+ case 0:
1537
+ delta = blk->yMin - yMax;
1538
+ break;
1539
+ case 1:
1540
+ delta = xMin - blk->xMax;
1541
+ break;
1542
+ case 2:
1543
+ delta = yMin - blk->yMax;
1544
+ break;
1545
+ case 3:
1546
+ delta = blk->xMin - xMax;
1547
+ break;
1548
+ }
1549
+ return delta;
1550
+ }
1551
+
1552
+ GBool TextBlock::isBelow(TextBlock *blk) {
1553
+ GBool below;
1554
+
1555
+ below = gFalse; // make gcc happy
1556
+ switch (page->primaryRot) {
1557
+ case 0:
1558
+ below = xMin >= blk->priMin && xMax <= blk->priMax &&
1559
+ yMin > blk->yMin;
1560
+ break;
1561
+ case 1:
1562
+ below = yMin >= blk->priMin && yMax <= blk->priMax &&
1563
+ xMax < blk->xMax;
1564
+ break;
1565
+ case 2:
1566
+ below = xMin >= blk->priMin && xMax <= blk->priMax &&
1567
+ yMax < blk->yMax;
1568
+ break;
1569
+ case 3:
1570
+ below = yMin >= blk->priMin && yMax <= blk->priMax &&
1571
+ xMin > blk->xMin;
1572
+ break;
1573
+ }
1574
+
1575
+ return below;
1576
+ }
1577
+
1578
+ //------------------------------------------------------------------------
1579
+ // TextFlow
1580
+ //------------------------------------------------------------------------
1581
+
1582
+ TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1583
+ page = pageA;
1584
+ xMin = blk->xMin;
1585
+ xMax = blk->xMax;
1586
+ yMin = blk->yMin;
1587
+ yMax = blk->yMax;
1588
+ priMin = blk->priMin;
1589
+ priMax = blk->priMax;
1590
+ blocks = lastBlk = blk;
1591
+ next = NULL;
1592
+ }
1593
+
1594
+ TextFlow::~TextFlow() {
1595
+ TextBlock *blk;
1596
+
1597
+ while (blocks) {
1598
+ blk = blocks;
1599
+ blocks = blocks->next;
1600
+ delete blk;
1601
+ }
1602
+ }
1603
+
1604
+ void TextFlow::addBlock(TextBlock *blk) {
1605
+ if (lastBlk) {
1606
+ lastBlk->next = blk;
1607
+ } else {
1608
+ blocks = blk;
1609
+ }
1610
+ lastBlk = blk;
1611
+ if (blk->xMin < xMin) {
1612
+ xMin = blk->xMin;
1613
+ }
1614
+ if (blk->xMax > xMax) {
1615
+ xMax = blk->xMax;
1616
+ }
1617
+ if (blk->yMin < yMin) {
1618
+ yMin = blk->yMin;
1619
+ }
1620
+ if (blk->yMax > yMax) {
1621
+ yMax = blk->yMax;
1622
+ }
1623
+ }
1624
+
1625
+ GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1626
+ GBool fits;
1627
+
1628
+ // lower blocks must use smaller fonts
1629
+ if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1630
+ return gFalse;
1631
+ }
1632
+
1633
+ fits = gFalse; // make gcc happy
1634
+ switch (page->primaryRot) {
1635
+ case 0:
1636
+ fits = blk->xMin >= priMin && blk->xMax <= priMax;
1637
+ break;
1638
+ case 1:
1639
+ fits = blk->yMin >= priMin && blk->yMax <= priMax;
1640
+ break;
1641
+ case 2:
1642
+ fits = blk->xMin >= priMin && blk->xMax <= priMax;
1643
+ break;
1644
+ case 3:
1645
+ fits = blk->yMin >= priMin && blk->yMax <= priMax;
1646
+ break;
1647
+ }
1648
+ return fits;
1649
+ }
1650
+
1651
+ #if TEXTOUT_WORD_LIST
1652
+
1653
+ //------------------------------------------------------------------------
1654
+ // TextWordList
1655
+ //------------------------------------------------------------------------
1656
+
1657
+ TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1658
+ TextFlow *flow;
1659
+ TextBlock *blk;
1660
+ TextLine *line;
1661
+ TextWord *word;
1662
+ TextWord **wordArray;
1663
+ int nWords, i;
1664
+
1665
+ words = new GList();
1666
+
1667
+ if (text->rawOrder) {
1668
+ for (word = text->rawWords; word; word = word->next) {
1669
+ words->append(word);
1670
+ }
1671
+
1672
+ } else if (physLayout) {
1673
+ // this is inefficient, but it's also the least useful of these
1674
+ // three cases
1675
+ nWords = 0;
1676
+ for (flow = text->flows; flow; flow = flow->next) {
1677
+ for (blk = flow->blocks; blk; blk = blk->next) {
1678
+ for (line = blk->lines; line; line = line->next) {
1679
+ for (word = line->words; word; word = word->next) {
1680
+ ++nWords;
1681
+ }
1682
+ }
1683
+ }
1684
+ }
1685
+ wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
1686
+ i = 0;
1687
+ for (flow = text->flows; flow; flow = flow->next) {
1688
+ for (blk = flow->blocks; blk; blk = blk->next) {
1689
+ for (line = blk->lines; line; line = line->next) {
1690
+ for (word = line->words; word; word = word->next) {
1691
+ wordArray[i++] = word;
1692
+ }
1693
+ }
1694
+ }
1695
+ }
1696
+ qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1697
+ for (i = 0; i < nWords; ++i) {
1698
+ words->append(wordArray[i]);
1699
+ }
1700
+ gfree(wordArray);
1701
+
1702
+ } else {
1703
+ for (flow = text->flows; flow; flow = flow->next) {
1704
+ for (blk = flow->blocks; blk; blk = blk->next) {
1705
+ for (line = blk->lines; line; line = line->next) {
1706
+ for (word = line->words; word; word = word->next) {
1707
+ words->append(word);
1708
+ }
1709
+ }
1710
+ }
1711
+ }
1712
+ }
1713
+ }
1714
+
1715
+ TextWordList::~TextWordList() {
1716
+ delete words;
1717
+ }
1718
+
1719
+ int TextWordList::getLength() {
1720
+ return words->getLength();
1721
+ }
1722
+
1723
+ TextWord *TextWordList::get(int idx) {
1724
+ if (idx < 0 || idx >= words->getLength()) {
1725
+ return NULL;
1726
+ }
1727
+ return (TextWord *)words->get(idx);
1728
+ }
1729
+
1730
+ #endif // TEXTOUT_WORD_LIST
1731
+
1732
+ //------------------------------------------------------------------------
1733
+ // TextPage
1734
+ //------------------------------------------------------------------------
1735
+
1736
+ TextPage::TextPage(GBool rawOrderA) {
1737
+ int rot;
1738
+
1739
+ rawOrder = rawOrderA;
1740
+ curWord = NULL;
1741
+ charPos = 0;
1742
+ curFont = NULL;
1743
+ curFontSize = 0;
1744
+ nest = 0;
1745
+ nTinyChars = 0;
1746
+ lastCharOverlap = gFalse;
1747
+ if (!rawOrder) {
1748
+ for (rot = 0; rot < 4; ++rot) {
1749
+ pools[rot] = new TextPool();
1750
+ }
1751
+ }
1752
+ flows = NULL;
1753
+ blocks = NULL;
1754
+ rawWords = NULL;
1755
+ rawLastWord = NULL;
1756
+ fonts = new GList();
1757
+ lastFindXMin = lastFindYMin = 0;
1758
+ haveLastFind = gFalse;
1759
+ underlines = new GList();
1760
+ links = new GList();
1761
+ }
1762
+
1763
+ TextPage::~TextPage() {
1764
+ int rot;
1765
+
1766
+ clear();
1767
+ if (!rawOrder) {
1768
+ for (rot = 0; rot < 4; ++rot) {
1769
+ delete pools[rot];
1770
+ }
1771
+ }
1772
+ delete fonts;
1773
+ deleteGList(underlines, TextUnderline);
1774
+ deleteGList(links, TextLink);
1775
+ }
1776
+
1777
+ void TextPage::startPage(GfxState *state) {
1778
+ clear();
1779
+ if (state) {
1780
+ pageWidth = state->getPageWidth();
1781
+ pageHeight = state->getPageHeight();
1782
+ } else {
1783
+ pageWidth = pageHeight = 0;
1784
+ }
1785
+ }
1786
+
1787
+ void TextPage::endPage() {
1788
+ if (curWord) {
1789
+ endWord();
1790
+ }
1791
+ }
1792
+
1793
+ void TextPage::clear() {
1794
+ int rot;
1795
+ TextFlow *flow;
1796
+ TextWord *word;
1797
+
1798
+ if (curWord) {
1799
+ delete curWord;
1800
+ curWord = NULL;
1801
+ }
1802
+ if (rawOrder) {
1803
+ while (rawWords) {
1804
+ word = rawWords;
1805
+ rawWords = rawWords->next;
1806
+ delete word;
1807
+ }
1808
+ } else {
1809
+ for (rot = 0; rot < 4; ++rot) {
1810
+ delete pools[rot];
1811
+ }
1812
+ while (flows) {
1813
+ flow = flows;
1814
+ flows = flows->next;
1815
+ delete flow;
1816
+ }
1817
+ gfree(blocks);
1818
+ }
1819
+ deleteGList(fonts, TextFontInfo);
1820
+
1821
+ curWord = NULL;
1822
+ charPos = 0;
1823
+ curFont = NULL;
1824
+ curFontSize = 0;
1825
+ nest = 0;
1826
+ nTinyChars = 0;
1827
+ if (!rawOrder) {
1828
+ for (rot = 0; rot < 4; ++rot) {
1829
+ pools[rot] = new TextPool();
1830
+ }
1831
+ }
1832
+ flows = NULL;
1833
+ blocks = NULL;
1834
+ rawWords = NULL;
1835
+ rawLastWord = NULL;
1836
+ fonts = new GList();
1837
+ }
1838
+
1839
+ void TextPage::updateFont(GfxState *state) {
1840
+ GfxFont *gfxFont;
1841
+ double *fm;
1842
+ char *name;
1843
+ int code, mCode, letterCode, anyCode;
1844
+ double w;
1845
+ int i;
1846
+
1847
+ // get the font info object
1848
+ curFont = NULL;
1849
+ for (i = 0; i < fonts->getLength(); ++i) {
1850
+ curFont = (TextFontInfo *)fonts->get(i);
1851
+ if (curFont->matches(state)) {
1852
+ break;
1853
+ }
1854
+ curFont = NULL;
1855
+ }
1856
+ if (!curFont) {
1857
+ curFont = new TextFontInfo(state);
1858
+ fonts->append(curFont);
1859
+ }
1860
+
1861
+ // adjust the font size
1862
+ gfxFont = state->getFont();
1863
+ curFontSize = state->getTransformedFontSize();
1864
+ if (gfxFont && gfxFont->getType() == fontType3) {
1865
+ // This is a hack which makes it possible to deal with some Type 3
1866
+ // fonts. The problem is that it's impossible to know what the
1867
+ // base coordinate system used in the font is without actually
1868
+ // rendering the font. This code tries to guess by looking at the
1869
+ // width of the character 'm' (which breaks if the font is a
1870
+ // subset that doesn't contain 'm').
1871
+ mCode = letterCode = anyCode = -1;
1872
+ for (code = 0; code < 256; ++code) {
1873
+ name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1874
+ if (name && name[0] == 'm' && name[1] == '\0') {
1875
+ mCode = code;
1876
+ }
1877
+ if (letterCode < 0 && name && name[1] == '\0' &&
1878
+ ((name[0] >= 'A' && name[0] <= 'Z') ||
1879
+ (name[0] >= 'a' && name[0] <= 'z'))) {
1880
+ letterCode = code;
1881
+ }
1882
+ if (anyCode < 0 && name &&
1883
+ ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
1884
+ anyCode = code;
1885
+ }
1886
+ }
1887
+ if (mCode >= 0 &&
1888
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1889
+ // 0.6 is a generic average 'm' width -- yes, this is a hack
1890
+ curFontSize *= w / 0.6;
1891
+ } else if (letterCode >= 0 &&
1892
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1893
+ // even more of a hack: 0.5 is a generic letter width
1894
+ curFontSize *= w / 0.5;
1895
+ } else if (anyCode >= 0 &&
1896
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1897
+ // better than nothing: 0.5 is a generic character width
1898
+ curFontSize *= w / 0.5;
1899
+ }
1900
+ fm = gfxFont->getFontMatrix();
1901
+ if (fm[0] != 0) {
1902
+ curFontSize *= fabs(fm[3] / fm[0]);
1903
+ }
1904
+ }
1905
+ }
1906
+
1907
+ void TextPage::beginWord(GfxState *state, double x0, double y0) {
1908
+ double *fontm;
1909
+ double m[4], m2[4];
1910
+ int rot;
1911
+
1912
+ // This check is needed because Type 3 characters can contain
1913
+ // text-drawing operations (when TextPage is being used via
1914
+ // {X,Win}SplashOutputDev rather than TextOutputDev).
1915
+ if (curWord) {
1916
+ ++nest;
1917
+ return;
1918
+ }
1919
+
1920
+ // compute the rotation
1921
+ state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
1922
+ if (state->getFont()->getType() == fontType3) {
1923
+ fontm = state->getFont()->getFontMatrix();
1924
+ m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
1925
+ m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
1926
+ m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
1927
+ m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
1928
+ m[0] = m2[0];
1929
+ m[1] = m2[1];
1930
+ m[2] = m2[2];
1931
+ m[3] = m2[3];
1932
+ }
1933
+ if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1934
+ rot = (m[3] < 0) ? 0 : 2;
1935
+ } else {
1936
+ rot = (m[2] > 0) ? 1 : 3;
1937
+ }
1938
+
1939
+ curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
1940
+ }
1941
+
1942
+ void TextPage::addChar(GfxState *state, double x, double y,
1943
+ double dx, double dy,
1944
+ CharCode c, int nBytes, Unicode *u, int uLen) {
1945
+ double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
1946
+ GBool overlap;
1947
+ int i;
1948
+
1949
+ // subtract char and word spacing from the dx,dy values
1950
+ sp = state->getCharSpace();
1951
+ if (c == (CharCode)0x20) {
1952
+ sp += state->getWordSpace();
1953
+ }
1954
+ state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1955
+ dx -= dx2;
1956
+ dy -= dy2;
1957
+ state->transformDelta(dx, dy, &w1, &h1);
1958
+
1959
+ // throw away chars that aren't inside the page bounds
1960
+ // (and also do a sanity check on the character size)
1961
+ state->transform(x, y, &x1, &y1);
1962
+ if (x1 + w1 < 0 || x1 > pageWidth ||
1963
+ y1 + h1 < 0 || y1 > pageHeight ||
1964
+ w1 > pageWidth || h1 > pageHeight) {
1965
+ charPos += nBytes;
1966
+ return;
1967
+ }
1968
+
1969
+ // check the tiny chars limit
1970
+ if (!globalParams->getTextKeepTinyChars() &&
1971
+ fabs(w1) < 3 && fabs(h1) < 3) {
1972
+ if (++nTinyChars > 50000) {
1973
+ charPos += nBytes;
1974
+ return;
1975
+ }
1976
+ }
1977
+
1978
+ // break words at space character
1979
+ if (uLen == 1 && u[0] == (Unicode)0x20) {
1980
+ if (curWord) {
1981
+ ++curWord->charLen;
1982
+ }
1983
+ charPos += nBytes;
1984
+ endWord();
1985
+ return;
1986
+ }
1987
+
1988
+ // start a new word if:
1989
+ // (1) this character doesn't fall in the right place relative to
1990
+ // the end of the previous word (this places upper and lower
1991
+ // constraints on the position deltas along both the primary
1992
+ // and secondary axes), or
1993
+ // (2) this character overlaps the previous one (duplicated text), or
1994
+ // (3) the previous character was an overlap (we want each duplicated
1995
+ // character to be in a word by itself at this stage),
1996
+ // (4) the font size has changed
1997
+ if (curWord && curWord->len > 0) {
1998
+ base = sp = delta = 0; // make gcc happy
1999
+ switch (curWord->rot) {
2000
+ case 0:
2001
+ base = y1;
2002
+ sp = x1 - curWord->xMax;
2003
+ delta = x1 - curWord->edge[curWord->len - 1];
2004
+ break;
2005
+ case 1:
2006
+ base = x1;
2007
+ sp = y1 - curWord->yMax;
2008
+ delta = y1 - curWord->edge[curWord->len - 1];
2009
+ break;
2010
+ case 2:
2011
+ base = y1;
2012
+ sp = curWord->xMin - x1;
2013
+ delta = curWord->edge[curWord->len - 1] - x1;
2014
+ break;
2015
+ case 3:
2016
+ base = x1;
2017
+ sp = curWord->yMin - y1;
2018
+ delta = curWord->edge[curWord->len - 1] - y1;
2019
+ break;
2020
+ }
2021
+ overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
2022
+ fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2023
+ if (overlap || lastCharOverlap ||
2024
+ sp < -minDupBreakOverlap * curWord->fontSize ||
2025
+ sp > minWordBreakSpace * curWord->fontSize ||
2026
+ fabs(base - curWord->base) > 0.5 ||
2027
+ curFontSize != curWord->fontSize) {
2028
+ endWord();
2029
+ }
2030
+ lastCharOverlap = overlap;
2031
+ } else {
2032
+ lastCharOverlap = gFalse;
2033
+ }
2034
+
2035
+ if (uLen != 0) {
2036
+ // start a new word if needed
2037
+ if (!curWord) {
2038
+ beginWord(state, x, y);
2039
+ }
2040
+
2041
+ // page rotation and/or transform matrices can cause text to be
2042
+ // drawn in reverse order -- in this case, swap the begin/end
2043
+ // coordinates and break text into individual chars
2044
+ if ((curWord->rot == 0 && w1 < 0) ||
2045
+ (curWord->rot == 1 && h1 < 0) ||
2046
+ (curWord->rot == 2 && w1 > 0) ||
2047
+ (curWord->rot == 3 && h1 > 0)) {
2048
+ endWord();
2049
+ beginWord(state, x + dx, y + dy);
2050
+ x1 += w1;
2051
+ y1 += h1;
2052
+ w1 = -w1;
2053
+ h1 = -h1;
2054
+ }
2055
+
2056
+ // add the characters to the current word
2057
+ w1 /= uLen;
2058
+ h1 /= uLen;
2059
+ for (i = 0; i < uLen; ++i) {
2060
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
2061
+ }
2062
+ }
2063
+ if (curWord) {
2064
+ curWord->charLen += nBytes;
2065
+ }
2066
+ charPos += nBytes;
2067
+ }
2068
+
2069
+ void TextPage::endWord() {
2070
+ // This check is needed because Type 3 characters can contain
2071
+ // text-drawing operations (when TextPage is being used via
2072
+ // {X,Win}SplashOutputDev rather than TextOutputDev).
2073
+ if (nest > 0) {
2074
+ --nest;
2075
+ return;
2076
+ }
2077
+
2078
+ if (curWord) {
2079
+ addWord(curWord);
2080
+ curWord = NULL;
2081
+ }
2082
+ }
2083
+
2084
+ void TextPage::addWord(TextWord *word) {
2085
+ // throw away zero-length words -- they don't have valid xMin/xMax
2086
+ // values, and they're useless anyway
2087
+ if (word->len == 0) {
2088
+ delete word;
2089
+ return;
2090
+ }
2091
+
2092
+ if (rawOrder) {
2093
+ if (rawLastWord) {
2094
+ rawLastWord->next = word;
2095
+ } else {
2096
+ rawWords = word;
2097
+ }
2098
+ rawLastWord = word;
2099
+ } else {
2100
+ pools[word->rot]->addWord(word);
2101
+ }
2102
+ }
2103
+
2104
+ void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
2105
+ underlines->append(new TextUnderline(x0, y0, x1, y1));
2106
+ }
2107
+
2108
+ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) {
2109
+ links->append(new TextLink(xMin, yMin, xMax, yMax, link));
2110
+ }
2111
+
2112
+ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
2113
+ UnicodeMap *uMap;
2114
+ TextPool *pool;
2115
+ TextWord *word0, *word1, *word2;
2116
+ TextLine *line;
2117
+ TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
2118
+ TextBlock **blkArray;
2119
+ TextFlow *flow, *lastFlow;
2120
+ TextUnderline *underline;
2121
+ TextLink *link;
2122
+ int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2123
+ double minBase, maxBase, newMinBase, newMaxBase;
2124
+ double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2125
+ GBool found;
2126
+ int count[4];
2127
+ int lrCount;
2128
+ int firstBlkIdx, nBlocksLeft;
2129
+ int col1, col2;
2130
+ int i, j, n;
2131
+
2132
+ if (rawOrder) {
2133
+ primaryRot = 0;
2134
+ primaryLR = gTrue;
2135
+ return;
2136
+ }
2137
+
2138
+ uMap = globalParams->getTextEncoding();
2139
+ blkList = NULL;
2140
+ lastBlk = NULL;
2141
+ nBlocks = 0;
2142
+ primaryRot = -1;
2143
+
2144
+ #if 0 // for debugging
2145
+ printf("*** initial words ***\n");
2146
+ for (rot = 0; rot < 4; ++rot) {
2147
+ pool = pools[rot];
2148
+ for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2149
+ for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2150
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2151
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2152
+ word0->base, word0->fontSize, rot*90, word0->link);
2153
+ for (i = 0; i < word0->len; ++i) {
2154
+ fputc(word0->text[i] & 0xff, stdout);
2155
+ }
2156
+ printf("'\n");
2157
+ }
2158
+ }
2159
+ }
2160
+ printf("\n");
2161
+ #endif
2162
+
2163
+ #if 0 //~ for debugging
2164
+ for (i = 0; i < underlines->getLength(); ++i) {
2165
+ underline = (TextUnderline *)underlines->get(i);
2166
+ printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2167
+ underline->x0, underline->x1, underline->y0, underline->y1,
2168
+ underline->horiz);
2169
+ }
2170
+ #endif
2171
+
2172
+ if (doHTML) {
2173
+
2174
+ //----- handle underlining
2175
+ for (i = 0; i < underlines->getLength(); ++i) {
2176
+ underline = (TextUnderline *)underlines->get(i);
2177
+ if (underline->horiz) {
2178
+ // rot = 0
2179
+ if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2180
+ startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2181
+ endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2182
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2183
+ for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2184
+ //~ need to check the y value against the word baseline
2185
+ if (underline->x0 < word0->xMin + underlineSlack &&
2186
+ word0->xMax - underlineSlack < underline->x1) {
2187
+ word0->underlined = gTrue;
2188
+ }
2189
+ }
2190
+ }
2191
+ }
2192
+
2193
+ // rot = 2
2194
+ if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2195
+ startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2196
+ endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2197
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2198
+ for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2199
+ if (underline->x0 < word0->xMin + underlineSlack &&
2200
+ word0->xMax - underlineSlack < underline->x1) {
2201
+ word0->underlined = gTrue;
2202
+ }
2203
+ }
2204
+ }
2205
+ }
2206
+ } else {
2207
+ // rot = 1
2208
+ if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2209
+ startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2210
+ endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2211
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2212
+ for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2213
+ if (underline->y0 < word0->yMin + underlineSlack &&
2214
+ word0->yMax - underlineSlack < underline->y1) {
2215
+ word0->underlined = gTrue;
2216
+ }
2217
+ }
2218
+ }
2219
+ }
2220
+
2221
+ // rot = 3
2222
+ if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2223
+ startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2224
+ endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2225
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2226
+ for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2227
+ if (underline->y0 < word0->yMin + underlineSlack &&
2228
+ word0->yMax - underlineSlack < underline->y1) {
2229
+ word0->underlined = gTrue;
2230
+ }
2231
+ }
2232
+ }
2233
+ }
2234
+ }
2235
+ }
2236
+
2237
+ //----- handle links
2238
+ for (i = 0; i < links->getLength(); ++i) {
2239
+ link = (TextLink *)links->get(i);
2240
+
2241
+ // rot = 0
2242
+ if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2243
+ startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2244
+ endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2245
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2246
+ for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2247
+ if (link->xMin < word0->xMin + hyperlinkSlack &&
2248
+ word0->xMax - hyperlinkSlack < link->xMax &&
2249
+ link->yMin < word0->yMin + hyperlinkSlack &&
2250
+ word0->yMax - hyperlinkSlack < link->yMax) {
2251
+ word0->link = link->link;
2252
+ }
2253
+ }
2254
+ }
2255
+ }
2256
+
2257
+ // rot = 2
2258
+ if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2259
+ startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2260
+ endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2261
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2262
+ for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2263
+ if (link->xMin < word0->xMin + hyperlinkSlack &&
2264
+ word0->xMax - hyperlinkSlack < link->xMax &&
2265
+ link->yMin < word0->yMin + hyperlinkSlack &&
2266
+ word0->yMax - hyperlinkSlack < link->yMax) {
2267
+ word0->link = link->link;
2268
+ }
2269
+ }
2270
+ }
2271
+ }
2272
+
2273
+ // rot = 1
2274
+ if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2275
+ startBaseIdx = pools[1]->getBaseIdx(link->xMin);
2276
+ endBaseIdx = pools[1]->getBaseIdx(link->xMax);
2277
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2278
+ for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2279
+ if (link->yMin < word0->yMin + hyperlinkSlack &&
2280
+ word0->yMax - hyperlinkSlack < link->yMax &&
2281
+ link->xMin < word0->xMin + hyperlinkSlack &&
2282
+ word0->xMax - hyperlinkSlack < link->xMax) {
2283
+ word0->link = link->link;
2284
+ }
2285
+ }
2286
+ }
2287
+ }
2288
+
2289
+ // rot = 3
2290
+ if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2291
+ startBaseIdx = pools[3]->getBaseIdx(link->xMin);
2292
+ endBaseIdx = pools[3]->getBaseIdx(link->xMax);
2293
+ for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2294
+ for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2295
+ if (link->yMin < word0->yMin + hyperlinkSlack &&
2296
+ word0->yMax - hyperlinkSlack < link->yMax &&
2297
+ link->xMin < word0->xMin + hyperlinkSlack &&
2298
+ word0->xMax - hyperlinkSlack < link->xMax) {
2299
+ word0->link = link->link;
2300
+ }
2301
+ }
2302
+ }
2303
+ }
2304
+ }
2305
+ }
2306
+
2307
+ //----- assemble the blocks
2308
+
2309
+ //~ add an outer loop for writing mode (vertical text)
2310
+
2311
+ // build blocks for each rotation value
2312
+ for (rot = 0; rot < 4; ++rot) {
2313
+ pool = pools[rot];
2314
+ poolMinBaseIdx = pool->minBaseIdx;
2315
+ count[rot] = 0;
2316
+
2317
+ // add blocks until no more words are left
2318
+ while (1) {
2319
+
2320
+ // find the first non-empty line in the pool
2321
+ for (;
2322
+ poolMinBaseIdx <= pool->maxBaseIdx &&
2323
+ !pool->getPool(poolMinBaseIdx);
2324
+ ++poolMinBaseIdx) ;
2325
+ if (poolMinBaseIdx > pool->maxBaseIdx) {
2326
+ break;
2327
+ }
2328
+
2329
+ // look for the left-most word in the first four lines of the
2330
+ // pool -- this avoids starting with a superscript word
2331
+ startBaseIdx = poolMinBaseIdx;
2332
+ for (baseIdx = poolMinBaseIdx + 1;
2333
+ baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2334
+ ++baseIdx) {
2335
+ if (!pool->getPool(baseIdx)) {
2336
+ continue;
2337
+ }
2338
+ if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2339
+ < 0) {
2340
+ startBaseIdx = baseIdx;
2341
+ }
2342
+ }
2343
+
2344
+ // create a new block
2345
+ word0 = pool->getPool(startBaseIdx);
2346
+ pool->setPool(startBaseIdx, word0->next);
2347
+ word0->next = NULL;
2348
+ blk = new TextBlock(this, rot);
2349
+ blk->addWord(word0);
2350
+
2351
+ fontSize = word0->fontSize;
2352
+ minBase = maxBase = word0->base;
2353
+ colSpace1 = minColSpacing1 * fontSize;
2354
+ colSpace2 = minColSpacing2 * fontSize;
2355
+ lineSpace = maxLineSpacingDelta * fontSize;
2356
+ intraLineSpace = maxIntraLineDelta * fontSize;
2357
+
2358
+ // add words to the block
2359
+ do {
2360
+ found = gFalse;
2361
+
2362
+ // look for words on the line above the current top edge of
2363
+ // the block
2364
+ newMinBase = minBase;
2365
+ for (baseIdx = pool->getBaseIdx(minBase);
2366
+ baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2367
+ --baseIdx) {
2368
+ word0 = NULL;
2369
+ word1 = pool->getPool(baseIdx);
2370
+ while (word1) {
2371
+ if (word1->base < minBase &&
2372
+ word1->base >= minBase - lineSpace &&
2373
+ ((rot == 0 || rot == 2)
2374
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2375
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2376
+ fabs(word1->fontSize - fontSize) <
2377
+ maxBlockFontSizeDelta1 * fontSize) {
2378
+ word2 = word1;
2379
+ if (word0) {
2380
+ word0->next = word1->next;
2381
+ } else {
2382
+ pool->setPool(baseIdx, word1->next);
2383
+ }
2384
+ word1 = word1->next;
2385
+ word2->next = NULL;
2386
+ blk->addWord(word2);
2387
+ found = gTrue;
2388
+ newMinBase = word2->base;
2389
+ } else {
2390
+ word0 = word1;
2391
+ word1 = word1->next;
2392
+ }
2393
+ }
2394
+ }
2395
+ minBase = newMinBase;
2396
+
2397
+ // look for words on the line below the current bottom edge of
2398
+ // the block
2399
+ newMaxBase = maxBase;
2400
+ for (baseIdx = pool->getBaseIdx(maxBase);
2401
+ baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2402
+ ++baseIdx) {
2403
+ word0 = NULL;
2404
+ word1 = pool->getPool(baseIdx);
2405
+ while (word1) {
2406
+ if (word1->base > maxBase &&
2407
+ word1->base <= maxBase + lineSpace &&
2408
+ ((rot == 0 || rot == 2)
2409
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2410
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2411
+ fabs(word1->fontSize - fontSize) <
2412
+ maxBlockFontSizeDelta1 * fontSize) {
2413
+ word2 = word1;
2414
+ if (word0) {
2415
+ word0->next = word1->next;
2416
+ } else {
2417
+ pool->setPool(baseIdx, word1->next);
2418
+ }
2419
+ word1 = word1->next;
2420
+ word2->next = NULL;
2421
+ blk->addWord(word2);
2422
+ found = gTrue;
2423
+ newMaxBase = word2->base;
2424
+ } else {
2425
+ word0 = word1;
2426
+ word1 = word1->next;
2427
+ }
2428
+ }
2429
+ }
2430
+ maxBase = newMaxBase;
2431
+
2432
+ // look for words that are on lines already in the block, and
2433
+ // that overlap the block horizontally
2434
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2435
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2436
+ ++baseIdx) {
2437
+ word0 = NULL;
2438
+ word1 = pool->getPool(baseIdx);
2439
+ while (word1) {
2440
+ if (word1->base >= minBase - intraLineSpace &&
2441
+ word1->base <= maxBase + intraLineSpace &&
2442
+ ((rot == 0 || rot == 2)
2443
+ ? (word1->xMin < blk->xMax + colSpace1 &&
2444
+ word1->xMax > blk->xMin - colSpace1)
2445
+ : (word1->yMin < blk->yMax + colSpace1 &&
2446
+ word1->yMax > blk->yMin - colSpace1)) &&
2447
+ fabs(word1->fontSize - fontSize) <
2448
+ maxBlockFontSizeDelta2 * fontSize) {
2449
+ word2 = word1;
2450
+ if (word0) {
2451
+ word0->next = word1->next;
2452
+ } else {
2453
+ pool->setPool(baseIdx, word1->next);
2454
+ }
2455
+ word1 = word1->next;
2456
+ word2->next = NULL;
2457
+ blk->addWord(word2);
2458
+ found = gTrue;
2459
+ } else {
2460
+ word0 = word1;
2461
+ word1 = word1->next;
2462
+ }
2463
+ }
2464
+ }
2465
+
2466
+ // only check for outlying words (the next two chunks of code)
2467
+ // if we didn't find anything else
2468
+ if (found) {
2469
+ continue;
2470
+ }
2471
+
2472
+ // scan down the left side of the block, looking for words
2473
+ // that are near (but not overlapping) the block; if there are
2474
+ // three or fewer, add them to the block
2475
+ n = 0;
2476
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2477
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2478
+ ++baseIdx) {
2479
+ word1 = pool->getPool(baseIdx);
2480
+ while (word1) {
2481
+ if (word1->base >= minBase - intraLineSpace &&
2482
+ word1->base <= maxBase + intraLineSpace &&
2483
+ ((rot == 0 || rot == 2)
2484
+ ? (word1->xMax <= blk->xMin &&
2485
+ word1->xMax > blk->xMin - colSpace2)
2486
+ : (word1->yMax <= blk->yMin &&
2487
+ word1->yMax > blk->yMin - colSpace2)) &&
2488
+ fabs(word1->fontSize - fontSize) <
2489
+ maxBlockFontSizeDelta3 * fontSize) {
2490
+ ++n;
2491
+ break;
2492
+ }
2493
+ word1 = word1->next;
2494
+ }
2495
+ }
2496
+ if (n > 0 && n <= 3) {
2497
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2498
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2499
+ ++baseIdx) {
2500
+ word0 = NULL;
2501
+ word1 = pool->getPool(baseIdx);
2502
+ while (word1) {
2503
+ if (word1->base >= minBase - intraLineSpace &&
2504
+ word1->base <= maxBase + intraLineSpace &&
2505
+ ((rot == 0 || rot == 2)
2506
+ ? (word1->xMax <= blk->xMin &&
2507
+ word1->xMax > blk->xMin - colSpace2)
2508
+ : (word1->yMax <= blk->yMin &&
2509
+ word1->yMax > blk->yMin - colSpace2)) &&
2510
+ fabs(word1->fontSize - fontSize) <
2511
+ maxBlockFontSizeDelta3 * fontSize) {
2512
+ word2 = word1;
2513
+ if (word0) {
2514
+ word0->next = word1->next;
2515
+ } else {
2516
+ pool->setPool(baseIdx, word1->next);
2517
+ }
2518
+ word1 = word1->next;
2519
+ word2->next = NULL;
2520
+ blk->addWord(word2);
2521
+ if (word2->base < minBase) {
2522
+ minBase = word2->base;
2523
+ } else if (word2->base > maxBase) {
2524
+ maxBase = word2->base;
2525
+ }
2526
+ found = gTrue;
2527
+ break;
2528
+ } else {
2529
+ word0 = word1;
2530
+ word1 = word1->next;
2531
+ }
2532
+ }
2533
+ }
2534
+ }
2535
+
2536
+ // scan down the right side of the block, looking for words
2537
+ // that are near (but not overlapping) the block; if there are
2538
+ // three or fewer, add them to the block
2539
+ n = 0;
2540
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2541
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2542
+ ++baseIdx) {
2543
+ word1 = pool->getPool(baseIdx);
2544
+ while (word1) {
2545
+ if (word1->base >= minBase - intraLineSpace &&
2546
+ word1->base <= maxBase + intraLineSpace &&
2547
+ ((rot == 0 || rot == 2)
2548
+ ? (word1->xMin >= blk->xMax &&
2549
+ word1->xMin < blk->xMax + colSpace2)
2550
+ : (word1->yMin >= blk->yMax &&
2551
+ word1->yMin < blk->yMax + colSpace2)) &&
2552
+ fabs(word1->fontSize - fontSize) <
2553
+ maxBlockFontSizeDelta3 * fontSize) {
2554
+ ++n;
2555
+ break;
2556
+ }
2557
+ word1 = word1->next;
2558
+ }
2559
+ }
2560
+ if (n > 0 && n <= 3) {
2561
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2562
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2563
+ ++baseIdx) {
2564
+ word0 = NULL;
2565
+ word1 = pool->getPool(baseIdx);
2566
+ while (word1) {
2567
+ if (word1->base >= minBase - intraLineSpace &&
2568
+ word1->base <= maxBase + intraLineSpace &&
2569
+ ((rot == 0 || rot == 2)
2570
+ ? (word1->xMin >= blk->xMax &&
2571
+ word1->xMin < blk->xMax + colSpace2)
2572
+ : (word1->yMin >= blk->yMax &&
2573
+ word1->yMin < blk->yMax + colSpace2)) &&
2574
+ fabs(word1->fontSize - fontSize) <
2575
+ maxBlockFontSizeDelta3 * fontSize) {
2576
+ word2 = word1;
2577
+ if (word0) {
2578
+ word0->next = word1->next;
2579
+ } else {
2580
+ pool->setPool(baseIdx, word1->next);
2581
+ }
2582
+ word1 = word1->next;
2583
+ word2->next = NULL;
2584
+ blk->addWord(word2);
2585
+ if (word2->base < minBase) {
2586
+ minBase = word2->base;
2587
+ } else if (word2->base > maxBase) {
2588
+ maxBase = word2->base;
2589
+ }
2590
+ found = gTrue;
2591
+ break;
2592
+ } else {
2593
+ word0 = word1;
2594
+ word1 = word1->next;
2595
+ }
2596
+ }
2597
+ }
2598
+ }
2599
+
2600
+ } while (found);
2601
+
2602
+ //~ need to compute the primary writing mode (horiz/vert) in
2603
+ //~ addition to primary rotation
2604
+
2605
+ // coalesce the block, and add it to the list
2606
+ blk->coalesce(uMap);
2607
+ if (lastBlk) {
2608
+ lastBlk->next = blk;
2609
+ } else {
2610
+ blkList = blk;
2611
+ }
2612
+ lastBlk = blk;
2613
+ count[rot] += blk->charCount;
2614
+ if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2615
+ primaryRot = rot;
2616
+ }
2617
+ ++nBlocks;
2618
+ }
2619
+ }
2620
+
2621
+ #if 0 // for debugging
2622
+ printf("*** rotation ***\n");
2623
+ for (rot = 0; rot < 4; ++rot) {
2624
+ printf(" %d: %6d\n", rot, count[rot]);
2625
+ }
2626
+ printf(" primary rot = %d\n", primaryRot);
2627
+ printf("\n");
2628
+ #endif
2629
+
2630
+ #if 0 // for debugging
2631
+ printf("*** blocks ***\n");
2632
+ for (blk = blkList; blk; blk = blk->next) {
2633
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2634
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2635
+ for (line = blk->lines; line; line = line->next) {
2636
+ printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2637
+ line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2638
+ for (word0 = line->words; word0; word0 = word0->next) {
2639
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2640
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2641
+ word0->base, word0->fontSize, word0->spaceAfter);
2642
+ for (i = 0; i < word0->len; ++i) {
2643
+ fputc(word0->text[i] & 0xff, stdout);
2644
+ }
2645
+ printf("'\n");
2646
+ }
2647
+ }
2648
+ }
2649
+ printf("\n");
2650
+ #endif
2651
+
2652
+ // determine the primary direction
2653
+ lrCount = 0;
2654
+ for (blk = blkList; blk; blk = blk->next) {
2655
+ for (line = blk->lines; line; line = line->next) {
2656
+ for (word0 = line->words; word0; word0 = word0->next) {
2657
+ for (i = 0; i < word0->len; ++i) {
2658
+ if (unicodeTypeL(word0->text[i])) {
2659
+ ++lrCount;
2660
+ } else if (unicodeTypeR(word0->text[i])) {
2661
+ --lrCount;
2662
+ }
2663
+ }
2664
+ }
2665
+ }
2666
+ }
2667
+ primaryLR = lrCount >= 0;
2668
+
2669
+ #if 0 // for debugging
2670
+ printf("*** direction ***\n");
2671
+ printf("lrCount = %d\n", lrCount);
2672
+ printf("primaryLR = %d\n", primaryLR);
2673
+ #endif
2674
+
2675
+ //----- column assignment
2676
+
2677
+ // sort blocks into xy order for column assignment
2678
+ blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2679
+ for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2680
+ blocks[i] = blk;
2681
+ }
2682
+ qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2683
+
2684
+ // column assignment
2685
+ for (i = 0; i < nBlocks; ++i) {
2686
+ blk0 = blocks[i];
2687
+ col1 = 0;
2688
+ for (j = 0; j < i; ++j) {
2689
+ blk1 = blocks[j];
2690
+ col2 = 0; // make gcc happy
2691
+ switch (primaryRot) {
2692
+ case 0:
2693
+ if (blk0->xMin > blk1->xMax) {
2694
+ col2 = blk1->col + blk1->nColumns + 3;
2695
+ } else if (blk1->xMax == blk1->xMin) {
2696
+ col2 = blk1->col;
2697
+ } else {
2698
+ col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2699
+ (blk1->xMax - blk1->xMin)) *
2700
+ blk1->nColumns);
2701
+ }
2702
+ break;
2703
+ case 1:
2704
+ if (blk0->yMin > blk1->yMax) {
2705
+ col2 = blk1->col + blk1->nColumns + 3;
2706
+ } else if (blk1->yMax == blk1->yMin) {
2707
+ col2 = blk1->col;
2708
+ } else {
2709
+ col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2710
+ (blk1->yMax - blk1->yMin)) *
2711
+ blk1->nColumns);
2712
+ }
2713
+ break;
2714
+ case 2:
2715
+ if (blk0->xMax < blk1->xMin) {
2716
+ col2 = blk1->col + blk1->nColumns + 3;
2717
+ } else if (blk1->xMin == blk1->xMax) {
2718
+ col2 = blk1->col;
2719
+ } else {
2720
+ col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2721
+ (blk1->xMin - blk1->xMax)) *
2722
+ blk1->nColumns);
2723
+ }
2724
+ break;
2725
+ case 3:
2726
+ if (blk0->yMax < blk1->yMin) {
2727
+ col2 = blk1->col + blk1->nColumns + 3;
2728
+ } else if (blk1->yMin == blk1->yMax) {
2729
+ col2 = blk1->col;
2730
+ } else {
2731
+ col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2732
+ (blk1->yMin - blk1->yMax)) *
2733
+ blk1->nColumns);
2734
+ }
2735
+ break;
2736
+ }
2737
+ if (col2 > col1) {
2738
+ col1 = col2;
2739
+ }
2740
+ }
2741
+ blk0->col = col1;
2742
+ for (line = blk0->lines; line; line = line->next) {
2743
+ for (j = 0; j <= line->len; ++j) {
2744
+ line->col[j] += col1;
2745
+ }
2746
+ }
2747
+ }
2748
+
2749
+ #if 0 // for debugging
2750
+ printf("*** blocks, after column assignment ***\n");
2751
+ for (blk = blkList; blk; blk = blk->next) {
2752
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2753
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2754
+ blk->nColumns);
2755
+ for (line = blk->lines; line; line = line->next) {
2756
+ printf(" line:\n");
2757
+ for (word0 = line->words; word0; word0 = word0->next) {
2758
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2759
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2760
+ word0->base, word0->fontSize, word0->spaceAfter);
2761
+ for (i = 0; i < word0->len; ++i) {
2762
+ fputc(word0->text[i] & 0xff, stdout);
2763
+ }
2764
+ printf("'\n");
2765
+ }
2766
+ }
2767
+ }
2768
+ printf("\n");
2769
+ #endif
2770
+
2771
+ //----- reading order sort
2772
+
2773
+ // sort blocks into yx order (in preparation for reading order sort)
2774
+ qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot);
2775
+
2776
+ // compute space on left and right sides of each block
2777
+ for (i = 0; i < nBlocks; ++i) {
2778
+ blk0 = blocks[i];
2779
+ for (j = 0; j < nBlocks; ++j) {
2780
+ blk1 = blocks[j];
2781
+ if (blk1 != blk0) {
2782
+ blk0->updatePriMinMax(blk1);
2783
+ }
2784
+ }
2785
+ }
2786
+
2787
+ #if 0 // for debugging
2788
+ printf("*** blocks, after yx sort ***\n");
2789
+ for (i = 0; i < nBlocks; ++i) {
2790
+ blk = blocks[i];
2791
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
2792
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2793
+ blk->priMin, blk->priMax);
2794
+ for (line = blk->lines; line; line = line->next) {
2795
+ printf(" line:\n");
2796
+ for (word0 = line->words; word0; word0 = word0->next) {
2797
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2798
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2799
+ word0->base, word0->fontSize, word0->spaceAfter);
2800
+ for (j = 0; j < word0->len; ++j) {
2801
+ fputc(word0->text[j] & 0xff, stdout);
2802
+ }
2803
+ printf("'\n");
2804
+ }
2805
+ }
2806
+ }
2807
+ printf("\n");
2808
+ #endif
2809
+
2810
+ // build the flows
2811
+ //~ this needs to be adjusted for writing mode (vertical text)
2812
+ //~ this also needs to account for right-to-left column ordering
2813
+ blkArray = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2814
+ memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *));
2815
+ flows = lastFlow = NULL;
2816
+ firstBlkIdx = 0;
2817
+ nBlocksLeft = nBlocks;
2818
+ while (nBlocksLeft > 0) {
2819
+
2820
+ // find the upper-left-most block
2821
+ for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ;
2822
+ i = firstBlkIdx;
2823
+ blk = blkArray[i];
2824
+ for (j = firstBlkIdx + 1; j < nBlocks; ++j) {
2825
+ blk1 = blkArray[j];
2826
+ if (blk1) {
2827
+ if (blk && blk->secondaryDelta(blk1) > 0) {
2828
+ break;
2829
+ }
2830
+ if (blk1->primaryCmp(blk) < 0) {
2831
+ i = j;
2832
+ blk = blk1;
2833
+ }
2834
+ }
2835
+ }
2836
+ blkArray[i] = NULL;
2837
+ --nBlocksLeft;
2838
+ blk->next = NULL;
2839
+
2840
+ // create a new flow, starting with the upper-left-most block
2841
+ flow = new TextFlow(this, blk);
2842
+ if (lastFlow) {
2843
+ lastFlow->next = flow;
2844
+ } else {
2845
+ flows = flow;
2846
+ }
2847
+ lastFlow = flow;
2848
+ fontSize = blk->lines->words->fontSize;
2849
+
2850
+ // push the upper-left-most block on the stack
2851
+ blk->stackNext = NULL;
2852
+ blkStack = blk;
2853
+
2854
+ // find the other blocks in this flow
2855
+ while (blkStack) {
2856
+
2857
+ // find the upper-left-most block under (but within
2858
+ // maxBlockSpacing of) the top block on the stack
2859
+ blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize;
2860
+ blk = NULL;
2861
+ i = -1;
2862
+ for (j = firstBlkIdx; j < nBlocks; ++j) {
2863
+ blk1 = blkArray[j];
2864
+ if (blk1) {
2865
+ if (blkStack->secondaryDelta(blk1) > blkSpace) {
2866
+ break;
2867
+ }
2868
+ if (blk && blk->secondaryDelta(blk1) > 0) {
2869
+ break;
2870
+ }
2871
+ if (blk1->isBelow(blkStack) &&
2872
+ (!blk || blk1->primaryCmp(blk) < 0)) {
2873
+ i = j;
2874
+ blk = blk1;
2875
+ }
2876
+ }
2877
+ }
2878
+
2879
+ // if a suitable block was found, add it to the flow and push it
2880
+ // onto the stack
2881
+ if (blk && flow->blockFits(blk, blkStack)) {
2882
+ blkArray[i] = NULL;
2883
+ --nBlocksLeft;
2884
+ blk->next = NULL;
2885
+ flow->addBlock(blk);
2886
+ fontSize = blk->lines->words->fontSize;
2887
+ blk->stackNext = blkStack;
2888
+ blkStack = blk;
2889
+
2890
+ // otherwise (if there is no block under the top block or the
2891
+ // block is not suitable), pop the stack
2892
+ } else {
2893
+ blkStack = blkStack->stackNext;
2894
+ }
2895
+ }
2896
+ }
2897
+ gfree(blkArray);
2898
+
2899
+ #if 0 // for debugging
2900
+ printf("*** flows ***\n");
2901
+ for (flow = flows; flow; flow = flow->next) {
2902
+ printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
2903
+ flow->xMin, flow->xMax, flow->yMin, flow->yMax,
2904
+ flow->priMin, flow->priMax);
2905
+ for (blk = flow->blocks; blk; blk = blk->next) {
2906
+ printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
2907
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2908
+ blk->priMin, blk->priMax);
2909
+ for (line = blk->lines; line; line = line->next) {
2910
+ printf(" line:\n");
2911
+ for (word0 = line->words; word0; word0 = word0->next) {
2912
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2913
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2914
+ word0->base, word0->fontSize, word0->spaceAfter);
2915
+ for (i = 0; i < word0->len; ++i) {
2916
+ fputc(word0->text[i] & 0xff, stdout);
2917
+ }
2918
+ printf("'\n");
2919
+ }
2920
+ }
2921
+ }
2922
+ }
2923
+ printf("\n");
2924
+ #endif
2925
+
2926
+ if (uMap) {
2927
+ uMap->decRefCnt();
2928
+ }
2929
+ }
2930
+
2931
+ GBool TextPage::findText(Unicode *s, int len,
2932
+ GBool startAtTop, GBool stopAtBottom,
2933
+ GBool startAtLast, GBool stopAtLast,
2934
+ GBool caseSensitive, GBool backward,
2935
+ double *xMin, double *yMin,
2936
+ double *xMax, double *yMax) {
2937
+ TextBlock *blk;
2938
+ TextLine *line;
2939
+ Unicode *s2, *txt;
2940
+ Unicode *p;
2941
+ int txtSize, m, i, j, k;
2942
+ double xStart, yStart, xStop, yStop;
2943
+ double xMin0, yMin0, xMax0, yMax0;
2944
+ double xMin1, yMin1, xMax1, yMax1;
2945
+ GBool found;
2946
+
2947
+ //~ needs to handle right-to-left text
2948
+
2949
+ if (rawOrder) {
2950
+ return gFalse;
2951
+ }
2952
+
2953
+ // convert the search string to uppercase
2954
+ if (!caseSensitive) {
2955
+ s2 = (Unicode *)gmallocn(len, sizeof(Unicode));
2956
+ for (i = 0; i < len; ++i) {
2957
+ s2[i] = unicodeToUpper(s[i]);
2958
+ }
2959
+ } else {
2960
+ s2 = s;
2961
+ }
2962
+
2963
+ txt = NULL;
2964
+ txtSize = 0;
2965
+
2966
+ xStart = yStart = xStop = yStop = 0;
2967
+ if (startAtLast && haveLastFind) {
2968
+ xStart = lastFindXMin;
2969
+ yStart = lastFindYMin;
2970
+ } else if (!startAtTop) {
2971
+ xStart = *xMin;
2972
+ yStart = *yMin;
2973
+ }
2974
+ if (stopAtLast && haveLastFind) {
2975
+ xStop = lastFindXMin;
2976
+ yStop = lastFindYMin;
2977
+ } else if (!stopAtBottom) {
2978
+ xStop = *xMax;
2979
+ yStop = *yMax;
2980
+ }
2981
+
2982
+ found = gFalse;
2983
+ xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2984
+ xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2985
+
2986
+ for (i = backward ? nBlocks - 1 : 0;
2987
+ backward ? i >= 0 : i < nBlocks;
2988
+ i += backward ? -1 : 1) {
2989
+ blk = blocks[i];
2990
+
2991
+ // check: is the block above the top limit?
2992
+ if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
2993
+ continue;
2994
+ }
2995
+
2996
+ // check: is the block below the bottom limit?
2997
+ if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
2998
+ break;
2999
+ }
3000
+
3001
+ for (line = blk->lines; line; line = line->next) {
3002
+
3003
+ // check: is the line above the top limit?
3004
+ if (!startAtTop &&
3005
+ (backward ? line->yMin > yStart : line->yMin < yStart)) {
3006
+ continue;
3007
+ }
3008
+
3009
+ // check: is the line below the bottom limit?
3010
+ if (!stopAtBottom &&
3011
+ (backward ? line->yMin < yStop : line->yMin > yStop)) {
3012
+ continue;
3013
+ }
3014
+
3015
+ // convert the line to uppercase
3016
+ m = line->len;
3017
+ if (!caseSensitive) {
3018
+ if (m > txtSize) {
3019
+ txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3020
+ txtSize = m;
3021
+ }
3022
+ for (k = 0; k < m; ++k) {
3023
+ txt[k] = unicodeToUpper(line->text[k]);
3024
+ }
3025
+ } else {
3026
+ txt = line->text;
3027
+ }
3028
+
3029
+ // search each position in this line
3030
+ j = backward ? m - len : 0;
3031
+ p = txt + j;
3032
+ while (backward ? j >= 0 : j <= m - len) {
3033
+
3034
+ // compare the strings
3035
+ for (k = 0; k < len; ++k) {
3036
+ if (p[k] != s2[k]) {
3037
+ break;
3038
+ }
3039
+ }
3040
+
3041
+ // found it
3042
+ if (k == len) {
3043
+ switch (line->rot) {
3044
+ case 0:
3045
+ xMin1 = line->edge[j];
3046
+ xMax1 = line->edge[j + len];
3047
+ yMin1 = line->yMin;
3048
+ yMax1 = line->yMax;
3049
+ break;
3050
+ case 1:
3051
+ xMin1 = line->xMin;
3052
+ xMax1 = line->xMax;
3053
+ yMin1 = line->edge[j];
3054
+ yMax1 = line->edge[j + len];
3055
+ break;
3056
+ case 2:
3057
+ xMin1 = line->edge[j + len];
3058
+ xMax1 = line->edge[j];
3059
+ yMin1 = line->yMin;
3060
+ yMax1 = line->yMax;
3061
+ break;
3062
+ case 3:
3063
+ xMin1 = line->xMin;
3064
+ xMax1 = line->xMax;
3065
+ yMin1 = line->edge[j + len];
3066
+ yMax1 = line->edge[j];
3067
+ break;
3068
+ }
3069
+ if (backward) {
3070
+ if ((startAtTop ||
3071
+ yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
3072
+ (stopAtBottom ||
3073
+ yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3074
+ if (!found ||
3075
+ yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3076
+ xMin0 = xMin1;
3077
+ xMax0 = xMax1;
3078
+ yMin0 = yMin1;
3079
+ yMax0 = yMax1;
3080
+ found = gTrue;
3081
+ }
3082
+ }
3083
+ } else {
3084
+ if ((startAtTop ||
3085
+ yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
3086
+ (stopAtBottom ||
3087
+ yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
3088
+ if (!found ||
3089
+ yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
3090
+ xMin0 = xMin1;
3091
+ xMax0 = xMax1;
3092
+ yMin0 = yMin1;
3093
+ yMax0 = yMax1;
3094
+ found = gTrue;
3095
+ }
3096
+ }
3097
+ }
3098
+ }
3099
+ if (backward) {
3100
+ --j;
3101
+ --p;
3102
+ } else {
3103
+ ++j;
3104
+ ++p;
3105
+ }
3106
+ }
3107
+ }
3108
+ }
3109
+
3110
+ if (!caseSensitive) {
3111
+ gfree(s2);
3112
+ gfree(txt);
3113
+ }
3114
+
3115
+ if (found) {
3116
+ *xMin = xMin0;
3117
+ *xMax = xMax0;
3118
+ *yMin = yMin0;
3119
+ *yMax = yMax0;
3120
+ lastFindXMin = xMin0;
3121
+ lastFindYMin = yMin0;
3122
+ haveLastFind = gTrue;
3123
+ return gTrue;
3124
+ }
3125
+
3126
+ return gFalse;
3127
+ }
3128
+
3129
+ GString *TextPage::getText(double xMin, double yMin,
3130
+ double xMax, double yMax) {
3131
+ GString *s;
3132
+ UnicodeMap *uMap;
3133
+ GBool isUnicode;
3134
+ TextBlock *blk;
3135
+ TextLine *line;
3136
+ TextLineFrag *frags;
3137
+ int nFrags, fragsSize;
3138
+ TextLineFrag *frag;
3139
+ char space[8], eol[16];
3140
+ int spaceLen, eolLen;
3141
+ int lastRot;
3142
+ double x, y, delta;
3143
+ int col, idx0, idx1, i, j;
3144
+ GBool multiLine, oneRot;
3145
+
3146
+ s = new GString();
3147
+
3148
+ if (rawOrder) {
3149
+ return s;
3150
+ }
3151
+
3152
+ // get the output encoding
3153
+ if (!(uMap = globalParams->getTextEncoding())) {
3154
+ return s;
3155
+ }
3156
+ isUnicode = uMap->isUnicode();
3157
+ spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3158
+ eolLen = 0; // make gcc happy
3159
+ switch (globalParams->getTextEOL()) {
3160
+ case eolUnix:
3161
+ eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3162
+ break;
3163
+ case eolDOS:
3164
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3165
+ eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3166
+ break;
3167
+ case eolMac:
3168
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3169
+ break;
3170
+ }
3171
+
3172
+ //~ writing mode (horiz/vert)
3173
+
3174
+ // collect the line fragments that are in the rectangle
3175
+ fragsSize = 256;
3176
+ frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3177
+ nFrags = 0;
3178
+ lastRot = -1;
3179
+ oneRot = gTrue;
3180
+ for (i = 0; i < nBlocks; ++i) {
3181
+ blk = blocks[i];
3182
+ if (xMin < blk->xMax && blk->xMin < xMax &&
3183
+ yMin < blk->yMax && blk->yMin < yMax) {
3184
+ for (line = blk->lines; line; line = line->next) {
3185
+ if (xMin < line->xMax && line->xMin < xMax &&
3186
+ yMin < line->yMax && line->yMin < yMax) {
3187
+ idx0 = idx1 = -1;
3188
+ switch (line->rot) {
3189
+ case 0:
3190
+ y = 0.5 * (line->yMin + line->yMax);
3191
+ if (yMin < y && y < yMax) {
3192
+ j = 0;
3193
+ while (j < line->len) {
3194
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3195
+ idx0 = j;
3196
+ break;
3197
+ }
3198
+ ++j;
3199
+ }
3200
+ j = line->len - 1;
3201
+ while (j >= 0) {
3202
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3203
+ idx1 = j;
3204
+ break;
3205
+ }
3206
+ --j;
3207
+ }
3208
+ }
3209
+ break;
3210
+ case 1:
3211
+ x = 0.5 * (line->xMin + line->xMax);
3212
+ if (xMin < x && x < xMax) {
3213
+ j = 0;
3214
+ while (j < line->len) {
3215
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3216
+ idx0 = j;
3217
+ break;
3218
+ }
3219
+ ++j;
3220
+ }
3221
+ j = line->len - 1;
3222
+ while (j >= 0) {
3223
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3224
+ idx1 = j;
3225
+ break;
3226
+ }
3227
+ --j;
3228
+ }
3229
+ }
3230
+ break;
3231
+ case 2:
3232
+ y = 0.5 * (line->yMin + line->yMax);
3233
+ if (yMin < y && y < yMax) {
3234
+ j = 0;
3235
+ while (j < line->len) {
3236
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3237
+ idx0 = j;
3238
+ break;
3239
+ }
3240
+ ++j;
3241
+ }
3242
+ j = line->len - 1;
3243
+ while (j >= 0) {
3244
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3245
+ idx1 = j;
3246
+ break;
3247
+ }
3248
+ --j;
3249
+ }
3250
+ }
3251
+ break;
3252
+ case 3:
3253
+ x = 0.5 * (line->xMin + line->xMax);
3254
+ if (xMin < x && x < xMax) {
3255
+ j = 0;
3256
+ while (j < line->len) {
3257
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3258
+ idx0 = j;
3259
+ break;
3260
+ }
3261
+ ++j;
3262
+ }
3263
+ j = line->len - 1;
3264
+ while (j >= 0) {
3265
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3266
+ idx1 = j;
3267
+ break;
3268
+ }
3269
+ --j;
3270
+ }
3271
+ }
3272
+ break;
3273
+ }
3274
+ if (idx0 >= 0 && idx1 >= 0) {
3275
+ if (nFrags == fragsSize) {
3276
+ fragsSize *= 2;
3277
+ frags = (TextLineFrag *)
3278
+ greallocn(frags, fragsSize, sizeof(TextLineFrag));
3279
+ }
3280
+ frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
3281
+ ++nFrags;
3282
+ if (lastRot >= 0 && line->rot != lastRot) {
3283
+ oneRot = gFalse;
3284
+ }
3285
+ lastRot = line->rot;
3286
+ }
3287
+ }
3288
+ }
3289
+ }
3290
+ }
3291
+
3292
+ // sort the fragments and generate the string
3293
+ if (nFrags > 0) {
3294
+
3295
+ for (i = 0; i < nFrags; ++i) {
3296
+ frags[i].computeCoords(oneRot);
3297
+ }
3298
+ assignColumns(frags, nFrags, oneRot);
3299
+
3300
+ // if all lines in the region have the same rotation, use it;
3301
+ // otherwise, use the page's primary rotation
3302
+ if (oneRot) {
3303
+ qsort(frags, nFrags, sizeof(TextLineFrag),
3304
+ &TextLineFrag::cmpYXLineRot);
3305
+ } else {
3306
+ qsort(frags, nFrags, sizeof(TextLineFrag),
3307
+ &TextLineFrag::cmpYXPrimaryRot);
3308
+ }
3309
+ i = 0;
3310
+ while (i < nFrags) {
3311
+ delta = maxIntraLineDelta * frags[i].line->words->fontSize;
3312
+ for (j = i+1;
3313
+ j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
3314
+ ++j) ;
3315
+ qsort(frags + i, j - i, sizeof(TextLineFrag),
3316
+ oneRot ? &TextLineFrag::cmpXYColumnLineRot
3317
+ : &TextLineFrag::cmpXYColumnPrimaryRot);
3318
+ i = j;
3319
+ }
3320
+
3321
+ col = 0;
3322
+ multiLine = gFalse;
3323
+ for (i = 0; i < nFrags; ++i) {
3324
+ frag = &frags[i];
3325
+
3326
+ // insert a return
3327
+ if (frag->col < col ||
3328
+ (i > 0 && fabs(frag->base - frags[i-1].base) >
3329
+ maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
3330
+ s->append(eol, eolLen);
3331
+ col = 0;
3332
+ multiLine = gTrue;
3333
+ }
3334
+
3335
+ // column alignment
3336
+ for (; col < frag->col; ++col) {
3337
+ s->append(space, spaceLen);
3338
+ }
3339
+
3340
+ // get the fragment text
3341
+ col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3342
+ }
3343
+
3344
+ if (multiLine) {
3345
+ s->append(eol, eolLen);
3346
+ }
3347
+ }
3348
+
3349
+ gfree(frags);
3350
+ uMap->decRefCnt();
3351
+
3352
+ return s;
3353
+ }
3354
+
3355
+ GBool TextPage::findCharRange(int pos, int length,
3356
+ double *xMin, double *yMin,
3357
+ double *xMax, double *yMax) {
3358
+ TextBlock *blk;
3359
+ TextLine *line;
3360
+ TextWord *word;
3361
+ double xMin0, xMax0, yMin0, yMax0;
3362
+ double xMin1, xMax1, yMin1, yMax1;
3363
+ GBool first;
3364
+ int i, j0, j1;
3365
+
3366
+ if (rawOrder) {
3367
+ return gFalse;
3368
+ }
3369
+
3370
+ //~ this doesn't correctly handle:
3371
+ //~ - ranges split across multiple lines (the highlighted region
3372
+ //~ is the bounding box of all the parts of the range)
3373
+ //~ - cases where characters don't convert one-to-one into Unicode
3374
+ first = gTrue;
3375
+ xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3376
+ xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3377
+ for (i = 0; i < nBlocks; ++i) {
3378
+ blk = blocks[i];
3379
+ for (line = blk->lines; line; line = line->next) {
3380
+ for (word = line->words; word; word = word->next) {
3381
+ if (pos < word->charPos + word->charLen &&
3382
+ word->charPos < pos + length) {
3383
+ j0 = pos - word->charPos;
3384
+ if (j0 < 0) {
3385
+ j0 = 0;
3386
+ }
3387
+ j1 = pos + length - 1 - word->charPos;
3388
+ if (j1 >= word->len) {
3389
+ j1 = word->len - 1;
3390
+ }
3391
+ switch (line->rot) {
3392
+ case 0:
3393
+ xMin1 = word->edge[j0];
3394
+ xMax1 = word->edge[j1 + 1];
3395
+ yMin1 = word->yMin;
3396
+ yMax1 = word->yMax;
3397
+ break;
3398
+ case 1:
3399
+ xMin1 = word->xMin;
3400
+ xMax1 = word->xMax;
3401
+ yMin1 = word->edge[j0];
3402
+ yMax1 = word->edge[j1 + 1];
3403
+ break;
3404
+ case 2:
3405
+ xMin1 = word->edge[j1 + 1];
3406
+ xMax1 = word->edge[j0];
3407
+ yMin1 = word->yMin;
3408
+ yMax1 = word->yMax;
3409
+ break;
3410
+ case 3:
3411
+ xMin1 = word->xMin;
3412
+ xMax1 = word->xMax;
3413
+ yMin1 = word->edge[j1 + 1];
3414
+ yMax1 = word->edge[j0];
3415
+ break;
3416
+ }
3417
+ if (first || xMin1 < xMin0) {
3418
+ xMin0 = xMin1;
3419
+ }
3420
+ if (first || xMax1 > xMax0) {
3421
+ xMax0 = xMax1;
3422
+ }
3423
+ if (first || yMin1 < yMin0) {
3424
+ yMin0 = yMin1;
3425
+ }
3426
+ if (first || yMax1 > yMax0) {
3427
+ yMax0 = yMax1;
3428
+ }
3429
+ first = gFalse;
3430
+ }
3431
+ }
3432
+ }
3433
+ }
3434
+ if (!first) {
3435
+ *xMin = xMin0;
3436
+ *xMax = xMax0;
3437
+ *yMin = yMin0;
3438
+ *yMax = yMax0;
3439
+ return gTrue;
3440
+ }
3441
+ return gFalse;
3442
+ }
3443
+
3444
+ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
3445
+ GBool physLayout) {
3446
+ UnicodeMap *uMap;
3447
+ TextFlow *flow;
3448
+ TextBlock *blk;
3449
+ TextLine *line;
3450
+ TextLineFrag *frags;
3451
+ TextWord *word;
3452
+ int nFrags, fragsSize;
3453
+ TextLineFrag *frag;
3454
+ char space[8], eol[16], eop[8];
3455
+ int spaceLen, eolLen, eopLen;
3456
+ GBool pageBreaks;
3457
+ GString *s;
3458
+ double delta;
3459
+ int col, i, j, d, n;
3460
+
3461
+ // get the output encoding
3462
+ if (!(uMap = globalParams->getTextEncoding())) {
3463
+ return;
3464
+ }
3465
+ spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3466
+ eolLen = 0; // make gcc happy
3467
+ switch (globalParams->getTextEOL()) {
3468
+ case eolUnix:
3469
+ eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3470
+ break;
3471
+ case eolDOS:
3472
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3473
+ eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3474
+ break;
3475
+ case eolMac:
3476
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3477
+ break;
3478
+ }
3479
+ eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
3480
+ pageBreaks = globalParams->getTextPageBreaks();
3481
+
3482
+ //~ writing mode (horiz/vert)
3483
+
3484
+ // output the page in raw (content stream) order
3485
+ if (rawOrder) {
3486
+
3487
+ for (word = rawWords; word; word = word->next) {
3488
+ s = new GString();
3489
+ dumpFragment(word->text, word->len, uMap, s);
3490
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
3491
+ delete s;
3492
+ if (word->next &&
3493
+ fabs(word->next->base - word->base) <
3494
+ maxIntraLineDelta * word->fontSize) {
3495
+ if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
3496
+ (*outputFunc)(outputStream, space, spaceLen);
3497
+ }
3498
+ } else {
3499
+ (*outputFunc)(outputStream, eol, eolLen);
3500
+ }
3501
+ }
3502
+
3503
+ // output the page, maintaining the original physical layout
3504
+ } else if (physLayout) {
3505
+
3506
+ // collect the line fragments for the page and sort them
3507
+ fragsSize = 256;
3508
+ frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3509
+ nFrags = 0;
3510
+ for (i = 0; i < nBlocks; ++i) {
3511
+ blk = blocks[i];
3512
+ for (line = blk->lines; line; line = line->next) {
3513
+ if (nFrags == fragsSize) {
3514
+ fragsSize *= 2;
3515
+ frags = (TextLineFrag *)greallocn(frags,
3516
+ fragsSize, sizeof(TextLineFrag));
3517
+ }
3518
+ frags[nFrags].init(line, 0, line->len);
3519
+ frags[nFrags].computeCoords(gTrue);
3520
+ ++nFrags;
3521
+ }
3522
+ }
3523
+ qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
3524
+ i = 0;
3525
+ while (i < nFrags) {
3526
+ delta = maxIntraLineDelta * frags[i].line->words->fontSize;
3527
+ for (j = i+1;
3528
+ j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
3529
+ ++j) ;
3530
+ qsort(frags + i, j - i, sizeof(TextLineFrag),
3531
+ &TextLineFrag::cmpXYColumnPrimaryRot);
3532
+ i = j;
3533
+ }
3534
+
3535
+ #if 0 // for debugging
3536
+ printf("*** line fragments ***\n");
3537
+ for (i = 0; i < nFrags; ++i) {
3538
+ frag = &frags[i];
3539
+ printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
3540
+ frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
3541
+ for (n = 0; n < frag->len; ++n) {
3542
+ fputc(frag->line->text[frag->start + n] & 0xff, stdout);
3543
+ }
3544
+ printf("'\n");
3545
+ }
3546
+ printf("\n");
3547
+ #endif
3548
+
3549
+ // generate output
3550
+ col = 0;
3551
+ for (i = 0; i < nFrags; ++i) {
3552
+ frag = &frags[i];
3553
+
3554
+ // column alignment
3555
+ for (; col < frag->col; ++col) {
3556
+ (*outputFunc)(outputStream, space, spaceLen);
3557
+ }
3558
+
3559
+ // print the line
3560
+ s = new GString();
3561
+ col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3562
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
3563
+ delete s;
3564
+
3565
+ // print one or more returns if necessary
3566
+ if (i == nFrags - 1 ||
3567
+ frags[i+1].col < col ||
3568
+ fabs(frags[i+1].base - frag->base) >
3569
+ maxIntraLineDelta * frag->line->words->fontSize) {
3570
+ if (i < nFrags - 1) {
3571
+ d = (int)((frags[i+1].base - frag->base) /
3572
+ frag->line->words->fontSize);
3573
+ if (d < 1) {
3574
+ d = 1;
3575
+ } else if (d > 5) {
3576
+ d = 5;
3577
+ }
3578
+ } else {
3579
+ d = 1;
3580
+ }
3581
+ for (; d > 0; --d) {
3582
+ (*outputFunc)(outputStream, eol, eolLen);
3583
+ }
3584
+ col = 0;
3585
+ }
3586
+ }
3587
+
3588
+ gfree(frags);
3589
+
3590
+ // output the page, "undoing" the layout
3591
+ } else {
3592
+ for (flow = flows; flow; flow = flow->next) {
3593
+ for (blk = flow->blocks; blk; blk = blk->next) {
3594
+ for (line = blk->lines; line; line = line->next) {
3595
+ n = line->len;
3596
+ if (line->hyphenated && (line->next || blk->next)) {
3597
+ --n;
3598
+ }
3599
+ s = new GString();
3600
+ dumpFragment(line->text, n, uMap, s);
3601
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
3602
+ delete s;
3603
+ if (!line->hyphenated) {
3604
+ if (line->next) {
3605
+ (*outputFunc)(outputStream, space, spaceLen);
3606
+ } else if (blk->next) {
3607
+ //~ this is a bit of a kludge - we should really do a more
3608
+ //~ intelligent determination of paragraphs
3609
+ if (blk->next->lines->words->fontSize ==
3610
+ blk->lines->words->fontSize) {
3611
+ (*outputFunc)(outputStream, space, spaceLen);
3612
+ } else {
3613
+ (*outputFunc)(outputStream, eol, eolLen);
3614
+ }
3615
+ }
3616
+ }
3617
+ }
3618
+ }
3619
+ (*outputFunc)(outputStream, eol, eolLen);
3620
+ (*outputFunc)(outputStream, eol, eolLen);
3621
+ }
3622
+ }
3623
+
3624
+ // end of page
3625
+ if (pageBreaks) {
3626
+ (*outputFunc)(outputStream, eop, eopLen);
3627
+ }
3628
+
3629
+ uMap->decRefCnt();
3630
+ }
3631
+
3632
+ void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
3633
+ TextLineFrag *frag0, *frag1;
3634
+ int rot, col1, col2, i, j, k;
3635
+
3636
+ // all text in the region has the same rotation -- recompute the
3637
+ // column numbers based only on the text in the region
3638
+ if (oneRot) {
3639
+ qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
3640
+ rot = frags[0].line->rot;
3641
+ for (i = 0; i < nFrags; ++i) {
3642
+ frag0 = &frags[i];
3643
+ col1 = 0;
3644
+ for (j = 0; j < i; ++j) {
3645
+ frag1 = &frags[j];
3646
+ col2 = 0; // make gcc happy
3647
+ switch (rot) {
3648
+ case 0:
3649
+ if (frag0->xMin >= frag1->xMax) {
3650
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3651
+ frag1->line->col[frag1->start]) + 1;
3652
+ } else {
3653
+ for (k = frag1->start;
3654
+ k < frag1->start + frag1->len &&
3655
+ frag0->xMin >= 0.5 * (frag1->line->edge[k] +
3656
+ frag1->line->edge[k+1]);
3657
+ ++k) ;
3658
+ col2 = frag1->col +
3659
+ frag1->line->col[k] - frag1->line->col[frag1->start];
3660
+ }
3661
+ break;
3662
+ case 1:
3663
+ if (frag0->yMin >= frag1->yMax) {
3664
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3665
+ frag1->line->col[frag1->start]) + 1;
3666
+ } else {
3667
+ for (k = frag1->start;
3668
+ k < frag1->start + frag1->len &&
3669
+ frag0->yMin >= 0.5 * (frag1->line->edge[k] +
3670
+ frag1->line->edge[k+1]);
3671
+ ++k) ;
3672
+ col2 = frag1->col +
3673
+ frag1->line->col[k] - frag1->line->col[frag1->start];
3674
+ }
3675
+ break;
3676
+ case 2:
3677
+ if (frag0->xMax <= frag1->xMin) {
3678
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3679
+ frag1->line->col[frag1->start]) + 1;
3680
+ } else {
3681
+ for (k = frag1->start;
3682
+ k < frag1->start + frag1->len &&
3683
+ frag0->xMax <= 0.5 * (frag1->line->edge[k] +
3684
+ frag1->line->edge[k+1]);
3685
+ ++k) ;
3686
+ col2 = frag1->col +
3687
+ frag1->line->col[k] - frag1->line->col[frag1->start];
3688
+ }
3689
+ break;
3690
+ case 3:
3691
+ if (frag0->yMax <= frag1->yMin) {
3692
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3693
+ frag1->line->col[frag1->start]) + 1;
3694
+ } else {
3695
+ for (k = frag1->start;
3696
+ k < frag1->start + frag1->len &&
3697
+ frag0->yMax <= 0.5 * (frag1->line->edge[k] +
3698
+ frag1->line->edge[k+1]);
3699
+ ++k) ;
3700
+ col2 = frag1->col +
3701
+ frag1->line->col[k] - frag1->line->col[frag1->start];
3702
+ }
3703
+ break;
3704
+ }
3705
+ if (col2 > col1) {
3706
+ col1 = col2;
3707
+ }
3708
+ }
3709
+ frag0->col = col1;
3710
+ }
3711
+
3712
+ // the region includes text at different rotations -- use the
3713
+ // globally assigned column numbers, offset by the minimum column
3714
+ // number (i.e., shift everything over to column 0)
3715
+ } else {
3716
+ col1 = frags[0].col;
3717
+ for (i = 1; i < nFrags; ++i) {
3718
+ if (frags[i].col < col1) {
3719
+ col1 = frags[i].col;
3720
+ }
3721
+ }
3722
+ for (i = 0; i < nFrags; ++i) {
3723
+ frags[i].col -= col1;
3724
+ }
3725
+ }
3726
+ }
3727
+
3728
+ int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
3729
+ GString *s) {
3730
+ char lre[8], rle[8], popdf[8], buf[8];
3731
+ int lreLen, rleLen, popdfLen, n;
3732
+ int nCols, i, j, k;
3733
+
3734
+ nCols = 0;
3735
+
3736
+ if (uMap->isUnicode()) {
3737
+
3738
+ lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
3739
+ rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
3740
+ popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
3741
+
3742
+ if (primaryLR) {
3743
+
3744
+ i = 0;
3745
+ while (i < len) {
3746
+ // output a left-to-right section
3747
+ for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
3748
+ for (k = i; k < j; ++k) {
3749
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3750
+ s->append(buf, n);
3751
+ ++nCols;
3752
+ }
3753
+ i = j;
3754
+ // output a right-to-left section
3755
+ for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
3756
+ if (j > i) {
3757
+ s->append(rle, rleLen);
3758
+ for (k = j - 1; k >= i; --k) {
3759
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3760
+ s->append(buf, n);
3761
+ ++nCols;
3762
+ }
3763
+ s->append(popdf, popdfLen);
3764
+ i = j;
3765
+ }
3766
+ }
3767
+
3768
+ } else {
3769
+
3770
+ s->append(rle, rleLen);
3771
+ i = len - 1;
3772
+ while (i >= 0) {
3773
+ // output a right-to-left section
3774
+ for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
3775
+ for (k = i; k > j; --k) {
3776
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3777
+ s->append(buf, n);
3778
+ ++nCols;
3779
+ }
3780
+ i = j;
3781
+ // output a left-to-right section
3782
+ for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
3783
+ if (j < i) {
3784
+ s->append(lre, lreLen);
3785
+ for (k = j + 1; k <= i; ++k) {
3786
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3787
+ s->append(buf, n);
3788
+ ++nCols;
3789
+ }
3790
+ s->append(popdf, popdfLen);
3791
+ i = j;
3792
+ }
3793
+ }
3794
+ s->append(popdf, popdfLen);
3795
+
3796
+ }
3797
+
3798
+ } else {
3799
+ for (i = 0; i < len; ++i) {
3800
+ n = uMap->mapUnicode(text[i], buf, sizeof(buf));
3801
+ s->append(buf, n);
3802
+ nCols += n;
3803
+ }
3804
+ }
3805
+
3806
+ return nCols;
3807
+ }
3808
+
3809
#if TEXTOUT_WORD_LIST
// Creates a new TextWordList for this page, passing physLayout
// through to its constructor, and returns it.
TextWordList *TextPage::makeWordList(GBool physLayout) {
  TextWordList *words = new TextWordList(this, physLayout);
  return words;
}
#endif
3814
+
3815
+ //------------------------------------------------------------------------
3816
+ // TextOutputDev
3817
+ //------------------------------------------------------------------------
3818
+
3819
// TextOutputFunc implementation that writes len bytes of text to the
// stdio stream passed (as a void *) in stream.
static void outputToFile(void *stream, char *text, int len) {
  FILE *out = (FILE *)stream;
  fwrite(text, 1, len, out);
}
3822
+
3823
+ TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
3824
+ GBool rawOrderA, GBool append) {
3825
+ text = NULL;
3826
+ physLayout = physLayoutA;
3827
+ rawOrder = rawOrderA;
3828
+ doHTML = gFalse;
3829
+ ok = gTrue;
3830
+
3831
+ // open file
3832
+ needClose = gFalse;
3833
+ if (fileName) {
3834
+ if (!strcmp(fileName, "-")) {
3835
+ outputStream = stdout;
3836
+ #ifdef WIN32
3837
+ // keep DOS from munging the end-of-line characters
3838
+ setmode(fileno(stdout), O_BINARY);
3839
+ #endif
3840
+ } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
3841
+ needClose = gTrue;
3842
+ } else {
3843
+ error(-1, "Couldn't open text file '%s'", fileName);
3844
+ ok = gFalse;
3845
+ return;
3846
+ }
3847
+ outputFunc = &outputToFile;
3848
+ } else {
3849
+ outputStream = NULL;
3850
+ }
3851
+
3852
+ // set up text object
3853
+ text = new TextPage(rawOrderA);
3854
+ }
3855
+
3856
+ TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
3857
+ GBool physLayoutA, GBool rawOrderA) {
3858
+ outputFunc = func;
3859
+ outputStream = stream;
3860
+ needClose = gFalse;
3861
+ physLayout = physLayoutA;
3862
+ rawOrder = rawOrderA;
3863
+ doHTML = gFalse;
3864
+ text = new TextPage(rawOrderA);
3865
+ ok = gTrue;
3866
+ }
3867
+
3868
+ TextOutputDev::~TextOutputDev() {
3869
+ if (needClose) {
3870
+ #ifdef MACOS
3871
+ ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
3872
+ #endif
3873
+ fclose((FILE *)outputStream);
3874
+ }
3875
+ if (text) {
3876
+ delete text;
3877
+ }
3878
+ }
3879
+
3880
+ void TextOutputDev::startPage(int pageNum, GfxState *state) {
3881
+ text->startPage(state);
3882
+ }
3883
+
3884
+ void TextOutputDev::endPage() {
3885
+ text->endPage();
3886
+ text->coalesce(physLayout, doHTML);
3887
+ if (outputStream) {
3888
+ text->dump(outputStream, outputFunc, physLayout);
3889
+ }
3890
+ }
3891
+
3892
+ void TextOutputDev::updateFont(GfxState *state) {
3893
+ text->updateFont(state);
3894
+ }
3895
+
3896
+ void TextOutputDev::beginString(GfxState *state, GString *s) {
3897
+ }
3898
+
3899
+ void TextOutputDev::endString(GfxState *state) {
3900
+ }
3901
+
3902
+ void TextOutputDev::drawChar(GfxState *state, double x, double y,
3903
+ double dx, double dy,
3904
+ double originX, double originY,
3905
+ CharCode c, int nBytes, Unicode *u, int uLen) {
3906
+ text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
3907
+ }
3908
+
3909
+ void TextOutputDev::stroke(GfxState *state) {
3910
+ GfxPath *path;
3911
+ GfxSubpath *subpath;
3912
+ double x[2], y[2];
3913
+
3914
+ if (!doHTML) {
3915
+ return;
3916
+ }
3917
+ path = state->getPath();
3918
+ if (path->getNumSubpaths() != 1) {
3919
+ return;
3920
+ }
3921
+ subpath = path->getSubpath(0);
3922
+ if (subpath->getNumPoints() != 2) {
3923
+ return;
3924
+ }
3925
+ state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
3926
+ state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
3927
+
3928
+ // look for a vertical or horizontal line
3929
+ if (x[0] == x[1] || y[0] == y[1]) {
3930
+ text->addUnderline(x[0], y[0], x[1], y[1]);
3931
+ }
3932
+ }
3933
+
3934
+ void TextOutputDev::fill(GfxState *state) {
3935
+ GfxPath *path;
3936
+ GfxSubpath *subpath;
3937
+ double x[5], y[5];
3938
+ double rx0, ry0, rx1, ry1, t;
3939
+ int i;
3940
+
3941
+ if (!doHTML) {
3942
+ return;
3943
+ }
3944
+ path = state->getPath();
3945
+ if (path->getNumSubpaths() != 1) {
3946
+ return;
3947
+ }
3948
+ subpath = path->getSubpath(0);
3949
+ if (subpath->getNumPoints() != 5) {
3950
+ return;
3951
+ }
3952
+ for (i = 0; i < 5; ++i) {
3953
+ if (subpath->getCurve(i)) {
3954
+ return;
3955
+ }
3956
+ state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
3957
+ }
3958
+
3959
+ // look for a rectangle
3960
+ if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
3961
+ x[0] == x[4] && y[0] == y[4]) {
3962
+ rx0 = x[0];
3963
+ ry0 = y[0];
3964
+ rx1 = x[2];
3965
+ ry1 = y[1];
3966
+ } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
3967
+ x[0] == x[4] && y[0] == y[4]) {
3968
+ rx0 = x[0];
3969
+ ry0 = y[0];
3970
+ rx1 = x[1];
3971
+ ry1 = y[2];
3972
+ } else {
3973
+ return;
3974
+ }
3975
+ if (rx1 < rx0) {
3976
+ t = rx0;
3977
+ rx0 = rx1;
3978
+ rx1 = t;
3979
+ }
3980
+ if (ry1 < ry0) {
3981
+ t = ry0;
3982
+ ry0 = ry1;
3983
+ ry1 = t;
3984
+ }
3985
+
3986
+ // skinny horizontal rectangle
3987
+ if (ry1 - ry0 < rx1 - rx0) {
3988
+ if (ry1 - ry0 < maxUnderlineWidth) {
3989
+ ry0 = 0.5 * (ry0 + ry1);
3990
+ text->addUnderline(rx0, ry0, rx1, ry0);
3991
+ }
3992
+
3993
+ // skinny vertical rectangle
3994
+ } else {
3995
+ if (rx1 - rx0 < maxUnderlineWidth) {
3996
+ rx0 = 0.5 * (rx0 + rx1);
3997
+ text->addUnderline(rx0, ry0, rx0, ry1);
3998
+ }
3999
+ }
4000
+ }
4001
+
4002
+ void TextOutputDev::eoFill(GfxState *state) {
4003
+ if (!doHTML) {
4004
+ return;
4005
+ }
4006
+ fill(state);
4007
+ }
4008
+
4009
+ void TextOutputDev::processLink(Link *link, Catalog *catalog) {
4010
+ double x1, y1, x2, y2;
4011
+ int xMin, yMin, xMax, yMax, x, y;
4012
+
4013
+ if (!doHTML) {
4014
+ return;
4015
+ }
4016
+ link->getRect(&x1, &y1, &x2, &y2);
4017
+ cvtUserToDev(x1, y1, &x, &y);
4018
+ xMin = xMax = x;
4019
+ yMin = yMax = y;
4020
+ cvtUserToDev(x1, y2, &x, &y);
4021
+ if (x < xMin) {
4022
+ xMin = x;
4023
+ } else if (x > xMax) {
4024
+ xMax = x;
4025
+ }
4026
+ if (y < yMin) {
4027
+ yMin = y;
4028
+ } else if (y > yMax) {
4029
+ yMax = y;
4030
+ }
4031
+ cvtUserToDev(x2, y1, &x, &y);
4032
+ if (x < xMin) {
4033
+ xMin = x;
4034
+ } else if (x > xMax) {
4035
+ xMax = x;
4036
+ }
4037
+ if (y < yMin) {
4038
+ yMin = y;
4039
+ } else if (y > yMax) {
4040
+ yMax = y;
4041
+ }
4042
+ cvtUserToDev(x2, y2, &x, &y);
4043
+ if (x < xMin) {
4044
+ xMin = x;
4045
+ } else if (x > xMax) {
4046
+ xMax = x;
4047
+ }
4048
+ if (y < yMin) {
4049
+ yMin = y;
4050
+ } else if (y > yMax) {
4051
+ yMax = y;
4052
+ }
4053
+ text->addLink(xMin, yMin, xMax, yMax, link);
4054
+ }
4055
+
4056
+ GBool TextOutputDev::findText(Unicode *s, int len,
4057
+ GBool startAtTop, GBool stopAtBottom,
4058
+ GBool startAtLast, GBool stopAtLast,
4059
+ GBool caseSensitive, GBool backward,
4060
+ double *xMin, double *yMin,
4061
+ double *xMax, double *yMax) {
4062
+ return text->findText(s, len, startAtTop, stopAtBottom,
4063
+ startAtLast, stopAtLast, caseSensitive, backward,
4064
+ xMin, yMin, xMax, yMax);
4065
+ }
4066
+
4067
+ GString *TextOutputDev::getText(double xMin, double yMin,
4068
+ double xMax, double yMax) {
4069
+ return text->getText(xMin, yMin, xMax, yMax);
4070
+ }
4071
+
4072
+ GBool TextOutputDev::findCharRange(int pos, int length,
4073
+ double *xMin, double *yMin,
4074
+ double *xMax, double *yMax) {
4075
+ return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
4076
+ }
4077
+
4078
#if TEXTOUT_WORD_LIST
// Builds a word list for the current page via
// TextPage::makeWordList, using this device's physLayout setting.
TextWordList *TextOutputDev::makeWordList() {
  TextWordList *words = text->makeWordList(physLayout);
  return words;
}
#endif
4083
+
4084
+ TextPage *TextOutputDev::takeText() {
4085
+ TextPage *ret;
4086
+
4087
+ ret = text;
4088
+ text = new TextPage(rawOrder);
4089
+ return ret;
4090
+ }