objective-ci 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. data/.gitignore +17 -0
  2. data/BSD-LICENSE.txt +31 -0
  3. data/GPL-LICENSE.txt +16 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +92 -0
  7. data/Rakefile +24 -0
  8. data/bin/ada_count +4 -0
  9. data/bin/asm_count +4 -0
  10. data/bin/awk_count +4 -0
  11. data/bin/break_filelist +4 -0
  12. data/bin/c_count +4 -0
  13. data/bin/cobol_count +4 -0
  14. data/bin/compute_all +4 -0
  15. data/bin/compute_sloc_lang +4 -0
  16. data/bin/count_extensions +4 -0
  17. data/bin/count_unknown_ext +4 -0
  18. data/bin/csh_count +4 -0
  19. data/bin/exp_count +4 -0
  20. data/bin/f90_count +4 -0
  21. data/bin/fortran_count +4 -0
  22. data/bin/generic_count +4 -0
  23. data/bin/get_sloc +4 -0
  24. data/bin/get_sloc_details +4 -0
  25. data/bin/haskell_count +4 -0
  26. data/bin/java_count +4 -0
  27. data/bin/jsp_count +4 -0
  28. data/bin/lex_count +4 -0
  29. data/bin/lexcount1 +4 -0
  30. data/bin/lisp_count +4 -0
  31. data/bin/make_filelists +4 -0
  32. data/bin/makefile_count +4 -0
  33. data/bin/ml_count +4 -0
  34. data/bin/modula3_count +4 -0
  35. data/bin/objc_count +4 -0
  36. data/bin/oclint-0.8 +4 -0
  37. data/bin/oclint-json-compilation-database +4 -0
  38. data/bin/oclint-xcodebuild +4 -0
  39. data/bin/pascal_count +4 -0
  40. data/bin/perl_count +4 -0
  41. data/bin/php_count +4 -0
  42. data/bin/pmd-cpd-objc +7 -0
  43. data/bin/print_sum +4 -0
  44. data/bin/python_count +4 -0
  45. data/bin/ruby_count +4 -0
  46. data/bin/sed_count +4 -0
  47. data/bin/sh_count +4 -0
  48. data/bin/show_filecount +4 -0
  49. data/bin/sloccount +4 -0
  50. data/bin/sql_count +4 -0
  51. data/bin/tcl_count +4 -0
  52. data/docs/jenkins-setup-violations.png +0 -0
  53. data/docs/jenkins-setup.jpg +0 -0
  54. data/externals/oclint/oclint-0.8 +0 -0
  55. data/externals/oclint/oclint-json-compilation-database +86 -0
  56. data/externals/oclint/oclint-xcodebuild +216 -0
  57. data/externals/pmd-cpd/ObjCLanguage-0.0.7-SNAPSHOT.jar +0 -0
  58. data/externals/pmd-cpd/pmd-4.2.5.jar +0 -0
  59. data/externals/sloccount/ada_count +27 -0
  60. data/externals/sloccount/asm_count +166 -0
  61. data/externals/sloccount/awk_count +27 -0
  62. data/externals/sloccount/break_filelist +1308 -0
  63. data/externals/sloccount/c_count +0 -0
  64. data/externals/sloccount/cobol_count +82 -0
  65. data/externals/sloccount/compute_all +87 -0
  66. data/externals/sloccount/compute_sloc_lang +66 -0
  67. data/externals/sloccount/count_extensions +56 -0
  68. data/externals/sloccount/count_unknown_ext +32 -0
  69. data/externals/sloccount/csh_count +27 -0
  70. data/externals/sloccount/exp_count +27 -0
  71. data/externals/sloccount/f90_count +81 -0
  72. data/externals/sloccount/fortran_count +83 -0
  73. data/externals/sloccount/generic_count +77 -0
  74. data/externals/sloccount/get_sloc +544 -0
  75. data/externals/sloccount/get_sloc_details +103 -0
  76. data/externals/sloccount/haskell_count +122 -0
  77. data/externals/sloccount/java_count +0 -0
  78. data/externals/sloccount/jsp_count +0 -0
  79. data/externals/sloccount/lex_count +70 -0
  80. data/externals/sloccount/lexcount1 +0 -0
  81. data/externals/sloccount/lisp_count +27 -0
  82. data/externals/sloccount/make_filelists +193 -0
  83. data/externals/sloccount/makefile_count +27 -0
  84. data/externals/sloccount/ml_count +0 -0
  85. data/externals/sloccount/modula3_count +65 -0
  86. data/externals/sloccount/objc_count +89 -0
  87. data/externals/sloccount/pascal_count +0 -0
  88. data/externals/sloccount/perl_count +147 -0
  89. data/externals/sloccount/php_count +0 -0
  90. data/externals/sloccount/print_sum +40 -0
  91. data/externals/sloccount/python_count +120 -0
  92. data/externals/sloccount/ruby_count +27 -0
  93. data/externals/sloccount/sed_count +27 -0
  94. data/externals/sloccount/sh_count +27 -0
  95. data/externals/sloccount/show_filecount +58 -0
  96. data/externals/sloccount/sloccount +258 -0
  97. data/externals/sloccount/sql_count +76 -0
  98. data/externals/sloccount/tcl_count +27 -0
  99. data/lib/objective-ci.rb +3 -0
  100. data/lib/objective_ci/ci_tasks.rb +142 -0
  101. data/lib/objective_ci/version.rb +3 -0
  102. data/objective-ci.gemspec +26 -0
  103. metadata +255 -0
@@ -0,0 +1,1308 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # break_filelist
4
+ # Take a list of dirs which contain a "filelist";
5
+ # creates files in each directory identifying which are C, C++, Perl, etc.
6
+ # For example, "ansic.dat" lists all ANSI C files contained in filelist.
7
+ # Note: ".h" files are ambiguous (they could be C or C++); the program
8
+ # uses heuristics to determine this.
9
+ # The list of .h files is also contained in h_list.dat.
10
+
11
+ # This is part of SLOCCount, a toolsuite that counts
12
+ # source lines of code (SLOC).
13
+ # Copyright (C) 2001-2004 David A. Wheeler.
14
+ #
15
+ # This program is free software; you can redistribute it and/or modify
16
+ # it under the terms of the GNU General Public License as published by
17
+ # the Free Software Foundation; either version 2 of the License, or
18
+ # (at your option) any later version.
19
+ #
20
+ # This program is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ # GNU General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU General Public License
26
+ # along with this program; if not, write to the Free Software
27
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28
+ #
29
+ # To contact David A. Wheeler, see his website at:
30
+ # http://www.dwheeler.com.
31
+
32
+
33
+ # If adding a new language: add the logic to open the file,
34
+ # close the file, and detect & write to the file listing that language.
35
+
36
+ # Debatable decisions:
37
+ # Doesn't count .dsl files (stylesheets, which are partially LISP).
38
+ # Doesn't count .sql files (SQL queries & commands)
39
+
40
+ # Note - I don't try to distinguish between TCL and [incr TCL] (itcl),
41
+ # an OO extended version of TCL. For our purposes, it's all TCL.
42
+
43
+
44
+ use FileHandle;
45
+
46
+
47
+ # Set default configuration:
48
+
49
+ $duplicates_okay = 0; # Set to 1 if you want to count file duplicates.
50
+ $crossdups_okay = 0; # Set to 1 if duplicates okay in different filelists.
51
+ $autogen_okay = 0; # Set to 1 if you want to count autogen'ed files.
52
+ $noisy = 0; # Set to 1 if you want noisy reports.
53
+ %lang_list_files = ();
54
+
55
+ # The following extensions are NOT code:
56
+ %not_code_extensions = (
57
+ "html" => 1,
58
+ "in" => 1, # Debatable.
59
+ "xpm" => 1,
60
+ "po" => 1,
61
+ "am" => 1, # Debatable.
62
+ "1" => 1, # Man pages (documentation):
63
+ "2" => 1,
64
+ "3" => 1,
65
+ "4" => 1,
66
+ "5" => 1,
67
+ "6" => 1,
68
+ "7" => 1,
69
+ "8" => 1,
70
+ "9" => 1,
71
+ "n" => 1,
72
+ "gif" => 1,
73
+ "tfm" => 1,
74
+ "png" => 1,
75
+ "m4" => 1, # Debatable.
76
+ "bdf" => 1,
77
+ "sgml" => 1,
78
+ "mf" => 1,
79
+ "txt" => 1, "text" => 1,
80
+ "man" => 1,
81
+ "xbm" => 1,
82
+ "Tag" => 1,
83
+ "sgm" => 1,
84
+ "vf" => 1,
85
+ "tex" => 1,
86
+ "elc" => 1,
87
+ "gz" => 1,
88
+ "dic" => 1,
89
+ "pfb" => 1,
90
+ "fig" => 1,
91
+ "afm" => 1, # font metrics
92
+ "jpg" => 1,
93
+ "bmp" => 1,
94
+ "htm" => 1,
95
+ "kdelnk" => 1,
96
+ "desktop" => 1,
97
+ "pbm" => 1,
98
+ "pdf" => 1,
99
+ "ps" => 1, # Postscript is _USUALLY_ generated automatically.
100
+ "eps" => 1,
101
+ "doc" => 1,
102
+ "man" => 1,
103
+ "o" => 1, # Object code is generated from source code.
104
+ "a" => 1, # Static object code.
105
+ "so" => 1, # Dynamically-loaded object code.
106
+ "Y" => 1, # file compressed with "Yabba"
107
+ "Z" => 1, # file compressed with "compress"
108
+ "ad" => 1, # X application default resource file.
109
+ "arc" => 1, # arc(1) archive
110
+ "arj" => 1, # arj(1) archive
111
+ "au" => 1, # Audio sound filearj(1) archive
112
+ "wav" => 1,
113
+ "bak" => 1, # Backup files - we only want to count the "real" files.
114
+ "bz2" => 1, # bzip2(1) compressed file
115
+ "mp3" => 1, # zip archive
116
+ "tgz" => 1, # tarball
117
+ "zip" => 1, # zip archive
118
+ );
119
+
120
+ # The following filenames are NOT code:
121
+ %not_code_filenames = (
122
+ "README" => 1,
123
+ "Readme" => 1,
124
+ "readme" => 1,
125
+ "README.tk" => 1, # used in kdemultimedia, it's confusing.
126
+ "Changelog" => 1,
127
+ "ChangeLog" => 1,
128
+ "Repository" => 1,
129
+ "CHANGES" => 1,
130
+ "Changes" => 1,
131
+ ".cvsignore" => 1,
132
+ "Root" => 1, # CVS.
133
+ "BUGS" => 1,
134
+ "TODO" => 1,
135
+ "COPYING" => 1,
136
+ "MAINTAINERS" => 1,
137
+ "Entries" => 1,
138
+ # Skip "iconfig.h" files; they're used in Imakefiles
139
+ # (used in xlockmore):
140
+ "iconfig.h" => 1,
141
+ );
142
+
143
+
144
+ # A filename ending in the following extensions usually maps to the
145
+ # given language:
146
+
147
+ # TODO: See suffixes(7)
148
+ # .al Perl autoload file
149
+ # .am automake input
150
+
151
+ %file_extensions = (
152
+ "c" => "ansic",
153
+ "ec" => "ansic", # Informix C.
154
+ "ecp" => "ansic", # Informix C.
155
+ "pgc" => "ansic", # Postgres embedded C/C++ (guess C)
156
+ "C" => "cpp", "cpp" => "cpp", "cxx" => "cpp", "cc" => "cpp",
157
+ "pcc" => "cpp", # Input to Oracle C++ preproc.
158
+ "m" => "objc",
159
+ # C# (C-sharp) is named 'cs', not 'c#', because
160
+ # the '#' is a comment character and I'm trying to
161
+ # avoid bug-prone conventions.
162
+ # C# doesn't support header files.
163
+ "cs" => "cs",
164
+ # Header files are allocated to the "h" language, and then
165
+ # copied to the correct location later so that C/C++/Objective-C
166
+ # can be separated.
167
+ "h" => "h", "H" => "h", "hpp" => "h", "hh" => "h",
168
+ "ada" => "ada", "adb" => "ada", "ads" => "ada",
169
+ "pad" => "ada", # Oracle Ada preprocessor.
170
+ "f" => "fortran", "F" => "fortran", # This catches "wokka.F" as Fortran.
171
+ # Warning: "Freeze" format also uses .f. Haven't heard of problems,
172
+ # freeze is extremely rare and even more rare in source code directories.
173
+ "f77" => "fortran", "F77" => "fortran",
174
+ "f90" => "f90", "F90" => "f90",
175
+ "cob" => "cobol", "cbl" => "cobol",
176
+ "COB" => "cobol", "CBL" => "cobol", # Yes, people do create wokka.CBL files
177
+ "p" => "pascal", "pas" => "pascal", "pp" => "pascal", "dpr" => "pascal",
178
+ "py" => "python",
179
+ "s" => "asm", "S" => "asm", "asm" => "asm",
180
+ "sh" => "sh", "bash" => "sh",
181
+ "csh" => "csh", "tcsh" => "csh",
182
+ "java" => "java",
183
+ "lisp" => "lisp", "el" => "lisp", "scm" => "lisp", "sc" => "lisp",
184
+ "lsp" => "lisp", "cl" => "lisp",
185
+ "jl" => "lisp",
186
+ "tcl" => "tcl", "tk" => "tcl", "itk" => "tcl",
187
+ "exp" => "exp",
188
+ "pl" => "perl", "pm" => "perl", "perl" => "perl", "ph" => "perl",
189
+ "awk" => "awk",
190
+ "sed" => "sed",
191
+ "y" => "yacc",
192
+ "l" => "lex",
193
+ "makefile" => "makefile",
194
+ "sql" => "sql",
195
+ "php" => "php", "php3" => "php", "php4" => "php", "php5" => "php",
196
+ "php6" => "php",
197
+ "inc" => "inc", # inc MAY be PHP - we'll handle it specially.
198
+ "m3" => "modula3", "i3" => "modula3",
199
+ "mg" => "modula3", "ig" => "modula3",
200
+ "ml" => "ml", "mli" => "ml",
201
+ "mly" => "ml", # ocamlyacc. In fact this is half-yacc half-ML, especially
202
+ # comments in yacc part are C-like, not ML like.
203
+ "mll" => "ml", # ocamllex, no such problems as in ocamlyacc
204
+ "rb" => "ruby",
205
+ "hs" => "haskell", "lhs" => "haskell",
206
+ # ???: .pco is Oracle Cobol
207
+ "jsp" => "jsp", # Java server pages
208
+ );
209
+
210
+
211
+ # GLOBAL VARIABLES
212
+
213
+ $dup_count = 0;
214
+
215
+ $warning_from_first_line = "";
216
+
217
+ %examined_directories = (); # Keys = Names of directories examined this run.
218
+
219
+ $duplistfile = "";
220
+
221
+ ###########
222
+
223
+
224
+ # Handle re-opening individual CODE_FILEs.
225
+ # CODE_FILE is public
226
+
227
+ # Private value:
228
+ $opened_file_name = "";
229
+
230
+ sub reopen {
231
+ # Open file if it isn't already, else rewind.
232
+ # If filename is "", close any open file.
233
+ my $filename = shift;
234
+ chomp($filename);
235
+ # print("DEBUG: reopen($filename)\n");
236
+ if ($filename eq "") {
237
+ if ($opened_file_name) {close(CODE_FILE);}
238
+ $opened_file_name = "";
239
+ return;
240
+ }
241
+ if ($filename eq $opened_file_name) {
242
+ seek CODE_FILE, 0, 0; # Rewind.
243
+ } else { # We're opening a new file.
244
+ if ($opened_file_name) {close(CODE_FILE)}
245
+ open(CODE_FILE, "<$filename\0") || die "Can't open $filename";
246
+ $opened_file_name = $filename;
247
+ }
248
+ }
249
+
250
+ ###########
251
+
252
+ sub looks_like_cpp {
253
+ # returns a confidence level - does the file looks like it's C++?
254
+ my $filename = shift;
255
+ my $confidence = 0;
256
+ chomp($filename);
257
+ open( SUSPECT, "<$filename");
258
+ while (defined($_ = <SUSPECT>)) {
259
+ if (m/^\s*class\b.*\{/) { # "}"
260
+ close(SUSPECT);
261
+ return 2;
262
+ }
263
+ if (m/^\s*class\b/) {
264
+ $confidence = 1;
265
+ }
266
+ }
267
+ close(SUSPECT);
268
+ return $confidence;
269
+ }
270
+
271
+
272
+ # Cache which files are objective-C or not.
273
+ # Key is the full file pathname; value is 1 if objective-C (else 0).
274
+ %objective_c_files = ();
275
+
276
+ sub really_is_objc {
277
+ # Given filename, returns TRUE if its contents really are objective-C.
278
+ my $filename = shift;
279
+ chomp($filename);
280
+
281
+ my $is_objc = 0; # Value to determine.
282
+ my $brace_lines = 0; # Lines that begin/end with curly braces.
283
+ my $plus_minus = 0; # Lines that begin with + or -.
284
+ my $word_main = 0; # Did we find "main("?
285
+ my $special = 0; # Did we find a special Objective-C pattern?
286
+
287
+ # Return cached result, if available:
288
+ if ($objective_c_files{$filename}) { return $objective_c_files{$filename};}
289
+
290
+ open(OBJC_FILE, "<$filename") ||
291
+ die "Can't open $filename to determine if it's objective C.\n";
292
+ while(<OBJC_FILE>) {
293
+
294
+ if (m/^\s*[{}]/ || m/[{}];?\s*$/) { $brace_lines++;}
295
+ if (m/^\s*[+-]/) {$plus_minus++;}
296
+ if (m/\bmain\s*\(/) {$word_main++;} # "main" followed by "("?
297
+ # Handle /usr/src/redhat/BUILD/egcs-1.1.2/gcc/objc/linking.m:
298
+ if (m/^\s*\[object name\];\s*$/i) {$special=1;}
299
+ }
300
+ close(OBJC_FILE);
301
+
302
+ if (($brace_lines > 1) && (($plus_minus > 1) || $word_main || $special))
303
+ {$is_objc = 1;}
304
+
305
+ $objective_c_files{$filename} = $is_objc; # Store result in cache.
306
+
307
+ return $is_objc;
308
+ }
309
+
310
+
311
+ # Cache which files are lex or not.
312
+ # Key is the full file pathname; value is 1 if lex (else 0).
313
+ %lex_files = ();
314
+
315
+ sub really_is_lex {
316
+ # Given filename, returns TRUE if its contents really is lex.
317
+ # lex file must have "%%", "%{", and "%}".
318
+ # In theory, a lex file doesn't need "%{" and "%}", but in practice
319
+ # they all have them, and requiring them avoid mislabeling a
320
+ # non-lexfile as a lex file.
321
+
322
+ my $filename = shift;
323
+ chomp($filename);
324
+
325
+ my $is_lex = 0; # Value to determine.
326
+ my $percent_percent = 0;
327
+ my $percent_opencurly = 0;
328
+ my $percent_closecurly = 0;
329
+
330
+ # Return cached result, if available:
331
+ if ($lex_files{$filename}) { return $lex_files{$filename};}
332
+
333
+ open(LEX_FILE, "<$filename") ||
334
+ die "Can't open $filename to determine if it's lex.\n";
335
+ while(<LEX_FILE>) {
336
+ $percent_percent++ if (m/^\s*\%\%/);
337
+ $percent_opencurly++ if (m/^\s*\%\{/);
338
+ $percent_closecurly++ if (m/^\s*\%\}/);
339
+ }
340
+ close(LEX_FILE);
341
+
342
+ if ($percent_percent && $percent_opencurly && $percent_closecurly)
343
+ {$is_lex = 1;}
344
+
345
+ $lex_files{$filename} = $is_lex; # Store result in cache.
346
+
347
+ return $is_lex;
348
+ }
349
+
350
+
351
+ # Cache which files are expect or not.
352
+ # Key is the full file pathname; value is 1 if it is (else 0).
353
+ %expect_files = ();
354
+
355
+ sub really_is_expect {
356
+ # Given filename, returns TRUE if its contents really are Expect.
357
+ # Many "exp" files (such as in Apache and Mesa) are just "export" data,
358
+ # summarizing something else # (e.g., its interface).
359
+ # Sometimes (like in RPM) it's just misc. data.
360
+ # Thus, we need to look at the file to determine
361
+ # if it's really an "expect" file.
362
+
363
+ my $filename = shift;
364
+ chomp($filename);
365
+
366
+ # The heuristic is as follows: it's Expect _IF_ it:
367
+ # 1. has "load_lib" command and either "#" comments or {}.
368
+ # 2. {, }, and one of: proc, if, [...], expect
369
+
370
+ my $is_expect = 0; # Value to determine.
371
+
372
+ my $begin_brace = 0; # Lines that begin with curly braces.
373
+ my $end_brace = 0; # Lines that begin with curly braces.
374
+ my $load_lib = 0; # Lines with the Load_lib command.
375
+ my $found_proc = 0;
376
+ my $found_if = 0;
377
+ my $found_brackets = 0;
378
+ my $found_expect = 0;
379
+ my $found_pound = 0;
380
+
381
+ # Return cached result, if available:
382
+ if ($expect_files{$filename}) { return expect_files{$filename};}
383
+
384
+ open(EXPECT_FILE, "<$filename") ||
385
+ die "Can't open $filename to determine if it's expect.\n";
386
+ while(<EXPECT_FILE>) {
387
+
388
+ if (m/#/) {$found_pound++; s/#.*//;}
389
+ if (m/^\s*\{/) { $begin_brace++;}
390
+ if (m/\{\s*$/) { $begin_brace++;}
391
+ if (m/^\s*\}/) { $end_brace++;}
392
+ if (m/\};?\s*$/) { $end_brace++;}
393
+ if (m/^\s*load_lib\s+\S/) { $load_lib++;}
394
+ if (m/^\s*proc\s/) { $found_proc++;}
395
+ if (m/^\s*if\s/) { $found_if++;}
396
+ if (m/\[.*\]/) { $found_brackets++;}
397
+ if (m/^\s*expect\s/) { $found_expect++;}
398
+ }
399
+ close(EXPECT_FILE);
400
+
401
+ if ($load_lib && ($found_pound || ($begin_brace && $end_brace)))
402
+ {$is_expect = 1;}
403
+ if ( $begin_brace && $end_brace &&
404
+ ($found_proc || $found_if || $found_brackets || $found_expect))
405
+ {$is_expect = 1;}
406
+
407
+ $expect_files{$filename} = $is_expect; # Store result in cache.
408
+
409
+ return $is_expect;
410
+ }
411
+
412
+
413
+ # Cached values.
414
+ %pascal_files = ();
415
+
416
+ sub really_is_pascal {
417
+ # Given filename, returns TRUE if its contents really are Pascal.
418
+
419
+ # This isn't as obvious as it seems.
420
+ # Many ".p" files are Perl files
421
+ # (such as /usr/src/redhat/BUILD/ispell-3.1/dicts/czech/glob.p),
422
+ # others are C extractions
423
+ # (such as /usr/src/redhat/BUILD/linux/include/linux/umsdos_fs.p
424
+ # and some files in linuxconf).
425
+ # However, test files in "p2c" really are Pascal, for example.
426
+
427
+ # Note that /usr/src/redhat/BUILD/ucd-snmp-4.1.1/ov/bitmaps/UCD.20.p
428
+ # is actually C code. The heuristics determine that they're not Pascal,
429
+ # but because it ends in ".p" it's not counted as C code either.
430
+ # I believe this is actually correct behavior, because frankly it
431
+ # looks like it's automatically generated (it's a bitmap expressed as code).
432
+ # Rather than guess otherwise, we don't include it in a list of
433
+ # source files. Let's face it, someone who creates C files ending in ".p"
434
+ # and expects them to be counted by default as C files in SLOCCount needs
435
+ # their head examined. I suggest examining their head
436
+ # with a sucker rod (see syslogd(8) for more on sucker rods).
437
+
438
+ # This heuristic counts as Pascal such files such as:
439
+ # /usr/src/redhat/BUILD/teTeX-1.0/texk/web2c/tangleboot.p
440
+ # Which is hand-generated. We don't count woven documents now anyway,
441
+ # so this is justifiable.
442
+
443
+ my $filename = shift;
444
+ chomp($filename);
445
+
446
+ # The heuristic is as follows: it's Pascal _IF_ it has all of the following
447
+ # (ignoring {...} and (*...*) comments):
448
+ # 1. "^..program NAME" or "^..unit NAME",
449
+ # 2. "procedure", "function", "^..interface", or "^..implementation",
450
+ # 3. a "begin", and
451
+ # 4. it ends with "end.",
452
+ #
453
+ # Or it has all of the following:
454
+ # 1. "^..module NAME" and
455
+ # 2. it ends with "end.".
456
+ #
457
+ # Or it has all of the following:
458
+ # 1. "^..program NAME",
459
+ # 2. a "begin", and
460
+ # 3. it ends with "end.".
461
+ #
462
+ # The "end." requirements in particular filter out non-Pascal.
463
+ #
464
+ # Note (jgb): this does not detect Pascal main files in fpc, like
465
+ # fpc-1.0.4/api/test/testterminfo.pas, which does not have "program" in
466
+ # it
467
+
468
+ my $is_pascal = 0; # Value to determine.
469
+
470
+ my $has_program = 0;
471
+ my $has_unit = 0;
472
+ my $has_module = 0;
473
+ my $has_procedure_or_function = 0;
474
+ my $found_begin = 0;
475
+ my $found_terminating_end = 0;
476
+
477
+ # Return cached result, if available:
478
+ if ($pascal_files{$filename}) { return pascal_files{$filename};}
479
+
480
+ open(PASCAL_FILE, "<$filename") ||
481
+ die "Can't open $filename to determine if it's pascal.\n";
482
+ while(<PASCAL_FILE>) {
483
+ s/\{.*?\}//g; # Ignore {...} comments on this line; imperfect, but effective.
484
+ s/\(\*.*?\*\)//g; # Ignore (*...*) comments on this line; imperfect, but effective.
485
+ if (m/\bprogram\s+[A-Za-z]/i) {$has_program=1;}
486
+ if (m/\bunit\s+[A-Za-z]/i) {$has_unit=1;}
487
+ if (m/\bmodule\s+[A-Za-z]/i) {$has_module=1;}
488
+ if (m/\bprocedure\b/i) { $has_procedure_or_function = 1; }
489
+ if (m/\bfunction\b/i) { $has_procedure_or_function = 1; }
490
+ if (m/^\s*interface\s+/i) { $has_procedure_or_function = 1; }
491
+ if (m/^\s*implementation\s+/i) { $has_procedure_or_function = 1; }
492
+ if (m/\bbegin\b/i) { $has_begin = 1; }
493
+ # Originally I said:
494
+ # "This heuristic fails if there are multi-line comments after
495
+ # "end."; I haven't seen that in real Pascal programs:"
496
+ # But jgb found there are a good quantity of them in Debian, specially in
497
+ # fpc (at the end of a lot of files there is a multiline comment
498
+ # with the changelog for the file).
499
+ # Therefore, assume Pascal if "end." appears anywhere in the file.
500
+ if (m/end\.\s*$/i) {$found_terminating_end = 1;}
501
+ # elsif (m/\S/) {$found_terminating_end = 0;}
502
+ }
503
+ close(PASCAL_FILE);
504
+
505
+ # Okay, we've examined the entire file looking for clues;
506
+ # let's use those clues to determine if it's really Pascal:
507
+
508
+ if ( ( ($has_unit || $has_program) && $has_procedure_or_function &&
509
+ $has_begin && $found_terminating_end ) ||
510
+ ( $has_module && $found_terminating_end ) ||
511
+ ( $has_program && $has_begin && $found_terminating_end ) )
512
+ {$is_pascal = 1;}
513
+
514
+ $pascal_files{$filename} = $is_pascal; # Store result in cache.
515
+
516
+ return $is_pascal;
517
+ }
518
+
519
+ sub really_is_incpascal {
520
+ # Given filename, returns TRUE if its contents really are Pascal.
521
+ # For .inc files (mainly seen in fpc)
522
+
523
+ my $filename = shift;
524
+ chomp($filename);
525
+
526
+ # The heuristic is as follows: it is Pacal if any of the following:
527
+ # 1. really_is_pascal returns true
528
+ # 2. Any usual reserverd word is found (program, unit, const, begin...)
529
+
530
+ # If the general routine for Pascal files works, we have it
531
+ if (&really_is_pascal ($filename)) {
532
+ $pascal_files{$filename} = 1;
533
+ return 1;
534
+ }
535
+
536
+ my $is_pascal = 0; # Value to determine.
537
+ my $found_begin = 0;
538
+
539
+ open(PASCAL_FILE, "<$filename") ||
540
+ die "Can't open $filename to determine if it's pascal.\n";
541
+ while(<PASCAL_FILE>) {
542
+ s/\{.*?\}//g; # Ignore {...} comments on this line; imperfect, but effective.
543
+ s/\(\*.*?\*\)//g; # Ignore (*...*) comments on this line; imperfect, but effective.
544
+ if (m/\bprogram\s+[A-Za-z]/i) {$is_pascal=1;}
545
+ if (m/\bunit\s+[A-Za-z]/i) {$is_pascal=1;}
546
+ if (m/\bmodule\s+[A-Za-z]/i) {$is_pascal=1;}
547
+ if (m/\bprocedure\b/i) {$is_pascal = 1; }
548
+ if (m/\bfunction\b/i) {$is_pascal = 1; }
549
+ if (m/^\s*interface\s+/i) {$is_pascal = 1; }
550
+ if (m/^\s*implementation\s+/i) {$is_pascal = 1; }
551
+ if (m/\bconstant\s+/i) {$is_pascal=1;}
552
+ if (m/\bbegin\b/i) { $found_begin = 1; }
553
+ if ((m/end\.\s*$/i) && ($found_begin = 1)) {$is_pascal = 1;}
554
+ if ($is_pascal) {
555
+ last;
556
+ }
557
+ }
558
+
559
+ close(PASCAL_FILE);
560
+ $pascal_files{$filename} = $is_pascal; # Store result in cache.
561
+ return $is_pascal;
562
+ }
563
+
564
+ # Cache which files are php or not.
565
+ # Key is the full file pathname; value is 1 if it is (else 0).
566
+ %php_files = ();
567
+
568
+ sub really_is_php {
569
+ # Given filename, returns TRUE if its contents really is php.
570
+
571
+ my $filename = shift;
572
+ chomp($filename);
573
+
574
+ my $is_php = 0; # Value to determine.
575
+ # Need to find a matching pair of surrounds, with ending after beginning:
576
+ my $normal_surround = 0; # <?; bit 0 = <?, bit 1 = ?>
577
+ my $script_surround = 0; # <script..>; bit 0 = <script language="php">
578
+ my $asp_surround = 0; # <%; bit 0 = <%, bit 1 = %>
579
+
580
+ # Return cached result, if available:
581
+ if ($php_files{$filename}) { return $php_files{$filename};}
582
+
583
+ open(PHP_FILE, "<$filename") ||
584
+ die "Can't open $filename to determine if it's php.\n";
585
+ while(<PHP_FILE>) {
586
+ if (m/\<\?/) { $normal_surround |= 1; }
587
+ if (m/\?\>/ && ($normal_surround & 1)) { $normal_surround |= 2; }
588
+ if (m/\<script.*language="?php"?/i) { $script_surround |= 1; }
589
+ if (m/\<\/script\>/i && ($script_surround & 1)) { $script_surround |= 2; }
590
+ if (m/\<\%/) { $asp_surround |= 1; }
591
+ if (m/\%\>/ && ($asp_surround & 1)) { $asp_surround |= 2; }
592
+ }
593
+ close(PHP_FILE);
594
+
595
+ if ( ($normal_surround == 3) || ($script_surround == 3) ||
596
+ ($asp_surround == 3)) {
597
+ $is_php = 1;
598
+ }
599
+
600
+ $php_files{$filename} = $is_php; # Store result in cache.
601
+
602
+ return $is_php;
603
+ }
604
+
605
+
606
+
607
+ sub examine_dir {
608
+ # Given a file, determine if there are only C++, OBJC, C, or a mixture
609
+ # in the same directory. Returns "ansic", "cpp", "objc" or "mix"
610
+ my $filename = shift;
611
+ chomp($filename);
612
+ my $dirname = $filename;
613
+ $dirname =~ s/\/[^\/]*$//;
614
+ my $saw_ansic_in_dir = 0;
615
+ my $saw_pc_in_dir = 0; # ".pc" may mean Oracle C.
616
+ my $saw_pcc_in_dir = 0; # ".pc" may mean Oracle C++.
617
+ my $saw_cpp_in_dir = 0;
618
+ my $saw_objc_in_dir = 0;
619
+ opendir(DIR, $dirname) || die "can't opendir $dirname";
620
+ while (defined($_ = readdir(DIR))) {
621
+ chomp;
622
+ next if (!$_);
623
+ if (m/\.(cpp|C|cxx|cc)$/ && -f "$dirname/$_") {$saw_cpp_in_dir = 1;}
624
+ if (m/\.c$/ && -f "$dirname/$_") {$saw_ansic_in_dir = 1;}
625
+ if (m/\.pc$/ && -f "$dirname/$_") {$saw_pc_in_dir = 1;}
626
+ if (m/\.pcc$/ && -f "$dirname/$_") {$saw_pcc_in_dir = 1;}
627
+ if (m/\.m$/ && -f "$dirname/$_" && &really_is_objc($dirname . "/" . $_))
628
+ {$saw_objc_in_dir = 1;}
629
+ if (($saw_ansic_in_dir + $saw_cpp_in_dir + $saw_objc_in_dir) > 1) {
630
+ closedir(DIR);
631
+ return "mix";
632
+ }
633
+ }
634
+ # Done searching; we saw at most one type.
635
+ if ($saw_ansic_in_dir) {return "c";}
636
+ elsif ($saw_cpp_in_dir) {return "cpp";}
637
+ elsif ($saw_objc_in_dir) {return "objc";}
638
+ elsif ($saw_pc_in_dir && (!$saw_pcc_in_dir)) {return "c";} # Guess "C".
639
+ elsif ($saw_pcc_in_dir && (!$saw_pc_in_dir)) {return "cpp";} # Guess "C".
640
+ else {return "mix";} # We didn't see anything... so let's say "mix".
641
+ }
642
+
643
+ sub was_generated_automatically() {
644
+ # Determine if the file was generated automatically.
645
+ # Use a simple heuristic: check if first few lines have phrases like
646
+ # "generated automatically", "automatically generated", "Generated by",
647
+ # or "do not edit" as the first
648
+ # words in the line (after possible comment markers and spaces).
649
+ my $filename = shift;
650
+
651
+ if ($autogen_okay) {return 0;};
652
+
653
+ chomp($filename);
654
+ reopen($filename);
655
+ $i = 15; # Look at first 15 lines.
656
+ while (defined($_ = <CODE_FILE>)) {
657
+ if (m/^[\s#\/\*;\-\%]*generated automatically/i ||
658
+ m/^[\s#\/\*;\-\%]*automatically generated/i ||
659
+ m/^[\s#\/\*;\-\%]*generated by /i || # libtool uses this.
660
+ m/^[\s#\/\*;\-\%]*a lexical scanner generated by flex/i ||
661
+ m/^[\s#\/\*;\-\%]*this is a generated file/i || # TeTex uses this.
662
+ m/^[\s#\/\*;\-\%]*generated with the.*utility/i || # TeTex uses this.
663
+ m/^[\s#\/\*;\-\%]*do not edit/i) {
664
+ return 1;
665
+ }
666
+ $i--;
667
+ last if $i <= 0;
668
+ }
669
+ return 0;
670
+ }
671
+
672
+
673
+ # Previous files added, indexed by digest:
674
+
675
+ %previous_files = ();
676
+
677
+ $cached_digest = "";
678
+ $cached_digest_filename = "";
679
+
680
+ $digest_method = undef;
681
+
682
+ sub compute_digest_given_method {
683
+ my $filename = shift;
684
+ my $method = shift;
685
+ my $result;
686
+
687
+ if ($method eq "md5sum") {
688
+ open(FH, "-|", "md5sum", $filename) or return undef;
689
+ $result = <FH>;
690
+ close FH;
691
+ return undef if ! defined($result);
692
+ chomp($result);
693
+ $result =~ s/^\s*//; # Not needed for GNU Textutils.
694
+ $result =~ s/[^a-fA-F0-9].*//; # Strip away end.
695
+ } elsif ($method eq "md5") {
696
+ open(FH, "-|", "md5", $filename) or return undef;
697
+ $result = <FH>;
698
+ close FH;
699
+ return undef if ! defined($result);
700
+ chomp($result);
701
+ $result =~ s/^.* //; # Strip away beginning.
702
+ } elsif ($method eq "openssl") {
703
+ open(FH, "-|", "openssl", "dgst", "-md5", $filename) or return undef;
704
+ $result = <FH>;
705
+ close FH;
706
+ return undef if ! defined($result);
707
+ chomp($result);
708
+ $result =~ s/^.* //; # Strip away beginning.
709
+ } else {
710
+ # "Can't happen"
711
+ die "Unknown method";
712
+ }
713
+ return $result;
714
+ }
715
+
716
+ sub compute_digest {
717
+ my $filename = shift;
718
+ my $result;
719
+ if (defined($digest_method)) {
720
+ $result = compute_digest_given_method($filename, $digest_method);
721
+ } else {
722
+ # Try each method in turn until one works.
723
+ # There doesn't seem to be a way in perl to disable an error message
724
+ # display if the command is missing, which is annoying. However, the
725
+ # program is more robust if we check for the command each time we run.
726
+ print "Finding a working MD5 command....\n";
727
+ foreach $m ("md5sum", "md5", "openssl") {
728
+ $result = compute_digest_given_method($filename, $m);
729
+ if (defined($result)) {
730
+ $digest_method = $m;
731
+ last;
732
+ }
733
+ }
734
+ if (!defined($digest_method)) {
735
+ die "Failure - could not find a working md5 program using $filename.";
736
+ }
737
+ print "Found a working MD5 command.\n";
738
+ }
739
+ return $result;
740
+ }
741
+
742
+ sub get_digest {
743
+ my $filename = shift;
744
+ my $result;
745
+ # First, check the cache -- did we just compute this?
746
+ if ($filename eq $cached_digest_filename) {
747
+ return $cached_digest; # We did, so here's what it was.
748
+ }
749
+
750
+ $result = compute_digest($filename);
751
+ # Store in most-recently-used cache.
752
+ $cached_digest = $result;
753
+ $cached_digest_filename = $filename;
754
+ return $result;
755
+ }
756
+
757
+ sub already_added {
758
+ # returns the first file's name with the same contents,
759
+ # else returns the empty string.
760
+
761
+ my $filename = shift;
762
+ my $digest = &get_digest($filename);
763
+
764
+ if ($previous_files{$digest}) {
765
+ return $previous_files{$digest};
766
+ } else {
767
+ return "";
768
+ }
769
+ }
770
+
771
+ sub close_lang_lists {
772
+ my $lang;
773
+ my $file;
774
+ while (($lang, $file) = each(%lang_list_files)) {
775
+ $file->close(); # Ignore any errors on close, there's little we can do.
776
+ }
777
+ %lang_list_files = ();
778
+ }
779
+
780
+ sub force_record_file_type {
781
+ my ($filename, $type) = @_;
782
+
783
+ if (!$type) {die "ERROR! File $filename, type $file_type\n";}
784
+ if ($type eq "c") {$type = "ansic";};
785
+ if (!defined($lang_list_files{$type})) {
786
+ $lang_list_files{$type} = new FileHandle("${dir}/${type}_list.dat", "w") ||
787
+ die "Could not open ${dir}/${type}_list.dat";
788
+ }
789
+ $lang_list_files{$type}->printf("%s\n", $filename);
790
+ }
791
+
792
+
793
+ sub record_file_type {
794
+ my ($filename, $type) = @_;
795
+ # First check if the file should be auto, dup, or zero - and add there
796
+ # if so. Otherwise, add to record of 'type'.
797
+
798
+ my $first_filename;
799
+
800
+ if (-z $filename) {
801
+ force_record_file_type($filename, "zero");
802
+ return;
803
+ }
804
+
805
+ if (&was_generated_automatically($filename)) {
806
+ force_record_file_type($filename, "auto");
807
+ return;
808
+ }
809
+
810
+ unless (($duplicates_okay) || ($type eq "not") || ($type eq "unknown")) {
811
+ $first_filename = &already_added($filename);
812
+ if ($first_filename) {
813
+ print "Note: $filename dups $first_filename\n" if $noisy;
814
+ force_record_file_type("$filename dups $first_filename", "dup");
815
+ $dup_count++;
816
+ return;
817
+ } else { # This isn't a duplicate - record that info, as needed.
818
+ my $digest = &get_digest($filename);
819
+ $previous_files{$digest} = $filename;
820
+ if ($duplistfile) {
821
+ print DUPLIST "$digest $filename\n";
822
+ }
823
+ }
824
+ }
825
+
826
+ force_record_file_type($filename, $type);
827
+ }
828
+
829
+
830
+
831
+ sub file_type_from_contents() {
832
+ # Determine if file type is a scripting language, and if so, return it.
833
+ # Returns its type as a string, or the empty string if it's undetermined.
834
+ my $filename = shift;
835
+ my $command;
836
+ chomp($filename);
837
+ reopen($filename);
838
+ # Don't do $firstline = <CODE_FILE> here because the file may be binary;
839
+ # instead, read in a fixed number of bytes:
840
+ read CODE_FILE, $firstline, 200;
841
+ return "" if (!$_);
842
+ chomp($firstline);
843
+ if (!$_) {return "";}
844
+ if (!$firstline) {return "";}
845
+
846
+ # Handle weirdness: If there's a ".cpp" file beginning with .\"
847
+ # then it clearly isn't C/C++... it's a man page. People who create
848
+ # and distribute man pages with such filename extensions should have
849
+ # a fingernail removed, slowly :-).
850
+ if (($firstline =~ m@^[,.]\\"@) &&
851
+ $filename =~ m@\.(c|cpp|C|cxx|cc)$@) {return "not";}
852
+
853
+
854
+ if (!($firstline =~ m@^#!@)) {return "";} # No script indicator here.
855
+
856
+ # studying $firstline doesn't speed things up, unfortunately.
857
+
858
+ # I once used a pattern that only acknowledged very specific directories,
859
+ # but I found that many test cases use unusual script locations
860
+ # (to ensure that they're invoking the correct program they're testing).
861
+ # Thus, we depend on the program being named with postfixed whitespace,
862
+ # and either begin named by itself or with a series of lowercase
863
+ # directories ending in "/".
864
+
865
+ # I developed these patterns by starting with patterns that appeared
866
+ # correct, and then examined the output (esp. warning messages) to see
867
+ # what I'd missed.
868
+
869
+ $command = "";
870
+
871
+ # Strip out any calls to sudo
872
+ if ($firstline =~ m@^#!\s*/(usr/)?bin/sudo\s+(/.*)@) {
873
+ $firstline = "#!" . $2;
874
+ }
875
+
876
+ if ($firstline =~ m@^#!\s*/(usr/)?bin/env\s+([a-zA-Z0-9\._]+)(\s|\Z)@i) {
877
+ $command = $2;
878
+ } elsif ($firstline =~ m@^#!\s*([a-zA-Z0-9\/\.]+\/)?([a-zA-Z0-9\._]+)(\s|\Z)@) {
879
+ $command = $2;
880
+ }
881
+
882
+ if ( ($command =~ m/^(bash|ksh|zsh|pdksh|sh)[0-9\.]*(\.exe)?$/i) ||
883
+ ($firstline =~
884
+ m~^#!\s*\@_?(SCRIPT_)?(PATH_)?(BA|K)?SH(ELL)?(\d+)?\@?(\s|\Z)~)) {
885
+ # Note: wish(1) uses a funny trick; see wish(1) for more info.
886
+ # The following code detects this unusual wish convention.
887
+ if ($firstline =~ m@exec wish(\s|\Z)@i) {
888
+ return "tcl"; # return the type for wish.
889
+ }
890
+ # Otherwise, it's shell.
891
+ return "sh";
892
+ }
893
+ if ( ($command =~ m/^(t?csh\d*)[0-9\.]*(\.exe)?$/i) ||
894
+ ($firstline =~ m@^#!\s*xCSH_PATHx(\s|\Z)@)) {
895
+ return "csh";
896
+ }
897
+ if ( ($command =~ m/^(mini)?perl[0-9\.]*(\.exe)?$/i) ||
898
+ ($command =~ m/^speedycgi[0-9\.]*(\.exe)?$/i) ||
899
+ ($firstline =~ m~^#!\s*\@_?(PATH_)?PERL\d*(PROG)?\@(\s|\Z)~) ||
900
+ ($firstline =~ m~^#!\s*xPERL_PATHx(\s|\Z)~)) {
901
+ return "perl";
902
+ }
903
+ if ($command =~ m/^python[0-9\.]*(\.exe)?$/i) {
904
+ return "python";
905
+ }
906
+ if ($command =~ m/^(tcl|tclsh|bltwish|wish|wishx|WISH)[0-9\.]*(\.exe)?$/i) {
907
+ return "tcl";
908
+ }
909
+ if ($command =~ m/^expectk?[0-9\.]*(\.exe)?$/i) { return "exp"; }
910
+ if ($command =~ m/^[ng]?awk[0-9\.]*(\.exe)?$/i) { return "awk"; }
911
+ if ($command =~ m/^sed$/i) { return "sed"; }
912
+ if ($command =~ m/^guile[0-9\.]*$/i) { return "lisp"; }
913
+ if ($firstline =~ m@^#!.*make\b@i) { # We'll claim that #! make is a makefile.
914
+ return "makefile";
915
+ }
916
+ if ($firstline =~ m@^#!\s*\.(\s|\Z)@) { # Lonely period.
917
+ return ""; # Ignore the first line, it's not helping.
918
+ }
919
+ if ($firstline =~ m@^#!\s*\Z@) { # Empty line.
920
+ return ""; # Ignore the first line, it's not helping.
921
+ }
922
+ if ($firstline =~ m@^#!\s*/dev/null@) { # /dev/null is the script?!?
923
+ return ""; # Ignore nonsense ("/dev/null").
924
+ }
925
+ if ($firstline =~ m@^#!\s*/unix(\s|Z)@) {
926
+ return ""; # Ignore nonsense ("/unix").
927
+ }
928
+ if (($filename =~ m@\.pl$@) || ($filename =~ m@\.pm$@)) {
929
+ return ""; # Don't warn about files that will be ID'd as perl files.
930
+ }
931
+ if (($filename =~ m@\.sh$@)) {
932
+ return ""; # Don't warn about files that will be ID'd as sh files.
933
+ }
934
+ if ($firstline =~ m@^#!\s*\S@) {
935
+ $firstline =~ s/\n.*//s; # Delete everything after first line.
936
+ $warning_from_first_line = "WARNING! File $filename has unknown start: $firstline";
937
+ return "";
938
+ }
939
+ return "";
940
+ }
941
+
942
+
943
+ sub get_file_type {
944
+ my $file_to_examine = shift;
945
+ # Return the given file's type.
946
+ # Consider the file's contents, filename, and file extension.
947
+
948
+ $warning_from_first_line = "";
949
+
950
+ # Skip file names known to not be program files.
951
+ $basename = $file_to_examine;
952
+ $basename =~ s!^.*/!!;
953
+ if ($not_code_filenames{$basename}) {
954
+ print "Note: Skipping non-program filename: $file_to_examine\n"
955
+ if $noisy;
956
+ return "not";
957
+ }
958
+
959
+ # Skip "configure" files if there's a corresponding "configure.in"
960
+ # file; such a situation suggests that "configure" is automatically
961
+ # generated by "autoconf" from "configure.in".
962
+ if (($file_to_examine =~ m!/configure$!) &&
963
+ (-s "${file_to_examine}.in")) {
964
+ print "Note: Auto-generated configure file $file_to_examine\n"
965
+ if $noisy;
966
+ return "auto";
967
+ }
968
+
969
+ if (($basename eq "lex.yy.c") || # Flex/Lex output!
970
+ ($basename eq "lex.yy.cc") || # Flex/Lex output - C++ scanner.
971
+ ($basename eq "y.code.c") || # yacc/bison output.
972
+ ($basename eq "y.tab.c") || # yacc output.
973
+ ($basename eq "y.tab.h")) { # yacc output.
974
+ print "Note: Auto-generated lex/yacc file $file_to_examine\n"
975
+ if $noisy;
976
+ return "auto";
977
+ }
978
+
979
+ # Bison is more flexible than yacc -- it can create arbitrary
980
+ # .c/.h files. If we have a .tab.[ch] file, with a corresponding
981
+ # .y file, then it's been automatically generated.
982
+ # Bison can actually save to any filename, and of course a Makefile
983
+ # can rename any file, but we can't help that.
984
+ if ($basename =~ m/\.tab\.[ch]$/) {
985
+ $possible_bison = $file_to_examine;
986
+ $possible_bison =~ s/\.tab\.[ch]$/\.y/;
987
+ if (-s "$possible_bison") {
988
+ print "Note: found bison-generated file $file_to_examine\n"
989
+ if $noisy;
990
+ return "auto";
991
+ }
992
+ }
993
+
994
+ # If there's a corresponding ".MASTER" file, treat this file
995
+ # as automatically-generated derivative. This handles "exmh".
996
+ if (-s "${file_to_examine}.MASTER") {
997
+ print "Note: Auto-generated non-.MASTER file $file_to_examine\n"
998
+ if $noisy;
999
+ return "auto";
1000
+ }
1001
+
1002
+ # Peek at first line to determine type. Note that the file contents
1003
+ # take precedence over the filename extension, because there are files
1004
+ # (such as /usr/src/redhat/BUILD/teTeX-1.0/texmf/doc/mkhtml.nawk)
1005
+ # which have one extension (say, ".nawk") but actually contain
1006
+ # something else (at least in part):
1007
+ $type = &file_type_from_contents($file_to_examine);
1008
+ if ($type) {
1009
+ return $type;
1010
+ }
1011
+
1012
+ # Use filename to determine if it's a makefile:
1013
+ if (($file_to_examine =~ m/\bmakefile$/i) ||
1014
+ ($file_to_examine =~ m/\bmakefile\.txt$/i) ||
1015
+ ($file_to_examine =~ m/\bmakefile\.pc$/i) ||
1016
+ ($file_to_examine =~ m/\bdebian\/rules$/i)) { # "debian/rules" too.
1017
+ return "makefile";
1018
+ }
1019
+
1020
+ # Try to use filename extension to determine type:
1021
+ if ($file_to_examine =~ m/\.([^.\/]+)$/) {
1022
+ $type = $1;
1023
+
1024
+ # More ugly problems: some source filenames only use
1025
+ # UPPERCASE, and they can be mixed with regular files.
1026
+ # Since normally filenames are lowercase or mixed case,
1027
+ # presume that an all-uppercase filename means we have to assume
1028
+ # that the extension must be lowercased. This particularly affects
1029
+ # .C, which usually means C++ but in this case would mean plain C.
1030
+ my $uppercase_filename = 0;
1031
+ if (($file_to_examine =~ m/[A-Z]/) &&
1032
+ (! ($file_to_examine =~ m/[a-z]/))) {
1033
+ $uppercase_filename = 1;
1034
+ $type = lc($type); # Use lowercase version of type.
1035
+ }
1036
+
1037
+ # Is this type known to NOT be a program?
1038
+ if ($not_code_extensions{$type}) {
1039
+ return "not";
1040
+ }
1041
+
1042
+ # Handle weirdness: ".hpp" is a C/C++ header file, UNLESS it's
1043
+ # makefile.hpp (a makefile); see /usr/src/redhat/BUILD,
1044
+ # pine4.21/pine/makefile.hpp and pine4.21/pico/makefile.hpp
1045
+ # Note that pine also includes pine4.21/pine/osdep/diskquot.hpp.
1046
+ # Kaffe uses .hpp for C++ header files.
1047
+ if (($type eq "hpp") && ($file_to_examine =~ m/makefile\.hpp$/i))
1048
+ {return "makefile";}
1049
+
1050
+ # If it's a C file but there's a ".pc" or ".pgc" file, then presume that
1051
+ # it was automatically generated:
1052
+ if ($type eq "c") {
1053
+ $pc_name = $file_to_examine;
1054
+ if ($uppercase_filename) { $pc_name =~ s/\.C$/\.PC/; }
1055
+ else { $pc_name =~ s/\.c$/\.pc/; }
1056
+ if (-s "$pc_name" ) {
1057
+ print "Note: Auto-generated C file (from .pc file) $file_to_examine\n"
1058
+ if $noisy;
1059
+ return "auto";
1060
+ }
1061
+ $pc_name = $file_to_examine;
1062
+ if ($uppercase_filename) { $pc_name =~ s/\.C$/\.PGC/; }
1063
+ else { $pc_name =~ s/\.c$/\.pgc/; }
1064
+ if (-s "$pc_name" ) {
1065
+ print "Note: Auto-generated C file (from .pgc file) $file_to_examine\n"
1066
+ if $noisy;
1067
+ return "auto";
1068
+ }
1069
+ }
1070
+
1071
+ # ".pc" is the official extension for Oracle C programs with
1072
+ # Embedded C commands, but many programs use ".pc" to indicate
1073
+ # the "PC" (MS-DOS/Windows) version of a file.
1074
+ # We'll use heuristics to detect when it's not really C,
1075
+ # otherwise claim it's C and move on.
1076
+ if ($type eq "pc") { # If it has one of these filenames, it's not C.
1077
+ if ($file_to_examine =~ m/\bmakefile\.pc$/i) { return "makefile"; }
1078
+ if (($file_to_examine =~ m/\bREADME\.pc$/i) ||
1079
+ ($file_to_examine =~ m/\binstall\.pc$/i) ||
1080
+ ($file_to_examine =~ m/\bchanges\.pc$/i)) {return "not";}
1081
+ else { return "c";}
1082
+ }
1083
+
1084
+ if (defined($file_extensions{$type})) {
1085
+ $type = $file_extensions{$type};
1086
+ if ( (($type eq "exp") && (!&really_is_expect($file_to_examine))) ||
1087
+ (($type eq "tk") && (!&really_is_expect($file_to_examine))) ||
1088
+ (($type eq "objc") && (!&really_is_objc($file_to_examine))) ||
1089
+ (($type eq "lex") && (!&really_is_lex($file_to_examine))) ||
1090
+ (($type eq "pascal") && (!&really_is_pascal($file_to_examine)))) {
1091
+ $type = "unknown";
1092
+ } elsif ($type eq "inc") {
1093
+ if (&really_is_php($file_to_examine)) {
1094
+ $type = "php"; # Hey, the .inc is PHP!
1095
+ } elsif (&really_is_incpascal($file_to_examine)) {
1096
+ $type = "pascal";
1097
+ } else {
1098
+ $type = "unknown";
1099
+ }
1100
+ };
1101
+ return $type;
1102
+ }
1103
+
1104
+ }
1105
+ # If we were expecting a script, warn about that.
1106
+ if ($warning_from_first_line) {print "$warning_from_first_line\n";}
1107
+ # Don't know what it is, so report "unknown".
1108
+ return "unknown";
1109
+ }
1110
+
1111
+
1112
+
1113
+
1114
+ sub convert_h_files {
1115
+ # Determine if the ".h" files we saw are C, OBJC, C++, or a mixture (!)
1116
+ # Usually ".hpp" files are C++, but if we didn't see any C++ files then
1117
+ # it probably isn't. This handles situations like pine; its has a file
1118
+ # /usr/src/redhat/BUILD/pine4.21/pine/osdep/diskquot.hpp
1119
+ # where the ".hpp" is for HP, not C++. (Of course, we completely miss
1120
+ # the other files in that pine directory because they have truly bizarre
1121
+ # extensions, but there's no easy way to handle such nonstandard things).
1122
+
1123
+ if (!defined($lang_list_files{"h"})) { return; }
1124
+
1125
+ my $saw_ansic = defined($lang_list_files{"ansic"});
1126
+ my $saw_cpp = defined($lang_list_files{"cpp"});
1127
+ my $saw_objc = defined($lang_list_files{"objc"});
1128
+ my $confidence;
1129
+
1130
+ $lang_list_files{"h"}->close();
1131
+
1132
+ open(H_LIST, "<${dir}/h_list.dat") || die "Can't reopen h_list\n";
1133
+
1134
+ if ($saw_ansic && (!$saw_cpp) && (!$saw_objc)) {
1135
+ # Only C, let's assume .h files are too
1136
+ while (defined($_ = <H_LIST>)) { chomp; force_record_file_type($_, "c"); };
1137
+ } elsif ($saw_cpp && (!$saw_ansic) && (!$saw_objc)) { # Only C++
1138
+ while (defined($_ = <H_LIST>)) { chomp; force_record_file_type($_, "cpp"); };
1139
+ } elsif ($saw_objc && (!$saw_ansic) && (!$saw_cpp)) { # Only Obj-C
1140
+ while (defined($_ = <H_LIST>)) { chomp; force_record_file_type($_, "objc"); };
1141
+ } else {
1142
+ # Ugh, we have a mixture. Let's try to determine what we have, using
1143
+ # various heuristics (looking for a matching name in the directory,
1144
+ # reading the file contents, the contents in the directory, etc.)
1145
+ # When all else fails, assume C.
1146
+ while (defined($_=<H_LIST>)) {
1147
+ chomp;
1148
+ next if (!$_);
1149
+ # print "DEBUG: H file $_\n";
1150
+
1151
+ $h_file = $_;
1152
+ $cpp2_equivalent =
1153
+ $cpp3_equivalent = $cpp4_equivalent = $objc_equivalent = $_;
1154
+ $ansic_equivalent = $cpp_equivalent = $_;
1155
+ $ansic_equivalent =~ s/h$/c/;
1156
+ $cpp_equivalent =~ s/h$/C/;
1157
+ $cpp2_equivalent =~ s/h$/cpp/;
1158
+ $cpp3_equivalent =~ s/h$/cxx/;
1159
+ $cpp4_equivalent =~ s/h$/cc/;
1160
+ $objc_equivalent =~ s/h$/m/;
1161
+ if (m!\.hpp$!) { force_record_file_type($h_file, "cpp"); }
1162
+ elsif ( (-s $cpp2_equivalent) ||
1163
+ (-s $cpp3_equivalent) || (-s $cpp4_equivalent))
1164
+ { force_record_file_type($h_file, "cpp"); }
1165
+ # Note: linuxconf has many ".m" files that match .h files,
1166
+ # but the ".m" files are straight C and _NOT_ objective-C.
1167
+ # The following test handles cases like this:
1168
+ elsif ($saw_objc && (-s $objc_equivalent) &&
1169
+ &really_is_objc($objc_equivalent))
1170
+ { &force_record_file_type($h_file, "objc"); }
1171
+ elsif (( -s $ansic_equivalent) && (! -s $cpp_equivalent))
1172
+ { force_record_file_type($h_file, "c"); }
1173
+ elsif ((-s $cpp_equivalent) && (! -s $ansic_equivalent))
1174
+ { force_record_file_type($h_file, "cpp"); }
1175
+ else {
1176
+ $confidence = &looks_like_cpp($h_file);
1177
+ if ($confidence == 2)
1178
+ { &force_record_file_type($h_file, "cpp"); }
1179
+ else {
1180
+ $files_in_dir = &examine_dir($h_file);
1181
+ if ($files_in_dir eq "cpp")
1182
+ { &force_record_file_type($h_file, "cpp"); }
1183
+ elsif ($files_in_dir eq "objc")
1184
+ { &force_record_file_type($h_file, "objc"); }
1185
+ elsif ($confidence == 1)
1186
+ { &force_record_file_type($h_file, "cpp"); }
1187
+ elsif ($h_file =~ m![a-z][0-9]*\.H$!)
1188
+ # Mixed-case filename, .H extension.
1189
+ { &force_record_file_type($h_file, "cpp"); }
1190
+ else # We're clueless. Let's guess C.
1191
+ { &force_record_file_type($h_file, "c"); };
1192
+ }
1193
+ }
1194
+ }
1195
+ } # Done handling ".h" files.
1196
+ close(H_LIST);
1197
+ }
1198
+
1199
+
1200
+ # MAIN PROGRAM STARTS HERE.
1201
+
1202
+ # Handle options.
1203
+ while (($#ARGV >= 0) && ($ARGV[0] =~ m/^--/)) {
1204
+ $duplicates_okay = 1 if ($ARGV[0] =~ m/^--duplicates$/); # Count duplicates.
1205
+ $crossdups_okay = 1 if ($ARGV[0] =~ m/^--crossdups$/); # Count crossdups.
1206
+ $autogen_okay = 1 if ($ARGV[0] =~ m/^--autogen$/); # Count autogen.
1207
+ $noisy = 1 if ($ARGV[0] =~ m/^--verbose$/); # Verbose output.
1208
+ if ($ARGV[0] =~ m/^--duplistfile$/) { # File to get/record dups.
1209
+ shift;
1210
+ $duplistfile = $ARGV[0];
1211
+ }
1212
+ last if ($ARGV[0] =~ m/^--$/);
1213
+ shift;
1214
+ }
1215
+
1216
+ if ($#ARGV < 0) {
1217
+ print "Error: No directory names given.\n";
1218
+ exit(1);
1219
+ }
1220
+
1221
+ if ($duplistfile) {
1222
+ if (-e $duplistfile) {
1223
+ open(DUPLIST, "<$duplistfile") || die "Can't open $duplistfile";
1224
+ while (defined($_ = <DUPLIST>)) {
1225
+ chomp;
1226
+ ($digest, $filename) = split(/ /, $_, 2);
1227
+ if (defined($digest) && defined($filename)) {
1228
+ $previous_files{$digest} = $filename;
1229
+ }
1230
+ }
1231
+ close(DUPLIST);
1232
+ }
1233
+ open(DUPLIST, ">>$duplistfile") || die "Can't open for writing $duplistfile";
1234
+ }
1235
+
1236
+
1237
+ while ( $dir = shift ) {
1238
+
1239
+ if (! -d "$dir") {
1240
+ print "Skipping non-directory $dir\n";
1241
+ next;
1242
+ }
1243
+
1244
+ if ($examined_directories{$dir}) {
1245
+ print "Skipping already-examined directory $dir\n";
1246
+ next;
1247
+ }
1248
+ $examined_directories{$dir} = 1;
1249
+
1250
+ if (! open(FILELIST, "<${dir}/filelist")) {
1251
+ print "Skipping directory $dir; it doesn't contain a file 'filelist'\n";
1252
+ next;
1253
+ }
1254
+
1255
+ if (-r "${dir}/all-physical.sloc") {
1256
+ # Skip already-analyzed directories; if it's been analyzed, we've already
1257
+ # broken them down.
1258
+ next;
1259
+ }
1260
+
1261
+ if ($crossdups_okay) { # Cross-dups okay; forget the hash of previous files.
1262
+ %previous_files = ();
1263
+ }
1264
+
1265
+ # insert blank lines, in case we need to recover from a midway crash
1266
+ if ($duplistfile) {
1267
+ print DUPLIST "\n";
1268
+ }
1269
+
1270
+
1271
+ $dup_count = 0;
1272
+
1273
+ while (defined($_ = <FILELIST>)) {
1274
+ chomp;
1275
+ $file = $_;
1276
+ next if (!defined($file) || ($file eq ""));
1277
+ if ($file =~ m/\n/) {
1278
+ print STDERR "WARNING! File name contains embedded newline; it'll be IGNORED.\n";
1279
+ print STDERR "Filename is: $file\n";
1280
+ next;
1281
+ }
1282
+ $file_type = &get_file_type($file);
1283
+ if ($file_type) {
1284
+ &record_file_type($file, $file_type);
1285
+ } else {
1286
+ print STDERR "WARNING! No file type selected for $file\n";
1287
+ }
1288
+ }
1289
+
1290
+ # Done with straightline processing. Now we need to determine if
1291
+ # the ".h" files we saw are C, OBJC, C++, or a mixture (!)
1292
+ &convert_h_files();
1293
+
1294
+
1295
+ # Done processing the directory. Close up shop so we're
1296
+ # ready for the next directory.
1297
+
1298
+ close(FILELIST);
1299
+ close_lang_lists();
1300
+ reopen(""); # Close code file.
1301
+
1302
+ if ($dup_count > 50) {
1303
+ print "Warning: in $dir, number of duplicates=$dup_count\n";
1304
+ }
1305
+
1306
+ }
1307
+
1308
+